diff --git a/cmake/Findclang.cmake b/cmake/Findclang.cmake index b4bd80773d..3ce52df893 100644 --- a/cmake/Findclang.cmake +++ b/cmake/Findclang.cmake @@ -9,27 +9,27 @@ find_path(CLANG_INCLUDE_DIRS NAMES clang/Frontend/ASTUnit.h PATHS - /usr/lib/llvm/11/include - /usr/lib/llvm-11/include - /usr/lib/llvm-11.0/include - /usr/local/llvm110/include - /usr/local/llvm11/include + /usr/lib/llvm/12/include + /usr/lib/llvm-12/include + /usr/lib/llvm-12.0/include + /usr/local/llvm120/include + /usr/local/llvm12/include /mingw64/include ) if(ZIG_PREFER_CLANG_CPP_DYLIB) find_library(CLANG_LIBRARIES NAMES - clang-cpp-11.0 - clang-cpp110 + clang-cpp-12.0 + clang-cpp120 clang-cpp PATHS ${CLANG_LIBDIRS} - /usr/lib/llvm/11/lib - /usr/lib/llvm/11/lib64 - /usr/lib/llvm-11/lib - /usr/local/llvm110/lib - /usr/local/llvm11/lib + /usr/lib/llvm/12/lib + /usr/lib/llvm/12/lib64 + /usr/lib/llvm-12/lib + /usr/local/llvm120/lib + /usr/local/llvm12/lib ) endif() @@ -39,11 +39,11 @@ if(NOT CLANG_LIBRARIES) find_library(CLANG_${_prettylibname_}_LIB NAMES ${_libname_} PATHS ${CLANG_LIBDIRS} - /usr/lib/llvm/11/lib - /usr/lib/llvm-11/lib - /usr/lib/llvm-11.0/lib - /usr/local/llvm110/lib - /usr/local/llvm11/lib + /usr/lib/llvm/12/lib + /usr/lib/llvm-12/lib + /usr/lib/llvm-12.0/lib + /usr/local/llvm120/lib + /usr/local/llvm12/lib /mingw64/lib /c/msys64/mingw64/lib c:\\msys64\\mingw64\\lib diff --git a/cmake/Findlld.cmake b/cmake/Findlld.cmake index 3103601ff8..72724ecd1e 100644 --- a/cmake/Findlld.cmake +++ b/cmake/Findlld.cmake @@ -8,16 +8,16 @@ find_path(LLD_INCLUDE_DIRS NAMES lld/Common/Driver.h PATHS - /usr/lib/llvm-11/include - /usr/local/llvm110/include - /usr/local/llvm11/include + /usr/lib/llvm-12/include + /usr/local/llvm120/include + /usr/local/llvm12/include /mingw64/include) -find_library(LLD_LIBRARY NAMES lld-11.0 lld110 lld +find_library(LLD_LIBRARY NAMES lld-12.0 lld120 lld PATHS - /usr/lib/llvm-11/lib - /usr/local/llvm110/lib - /usr/local/llvm11/lib + /usr/lib/llvm-12/lib + /usr/local/llvm120/lib + /usr/local/llvm12/lib ) if(EXISTS ${LLD_LIBRARY}) set(LLD_LIBRARIES ${LLD_LIBRARY}) @@ -27,9 +27,9 @@ else() find_library(LLD_${_prettylibname_}_LIB NAMES ${_libname_} PATHS ${LLD_LIBDIRS} - /usr/lib/llvm-11/lib - /usr/local/llvm110/lib - /usr/local/llvm11/lib + /usr/lib/llvm-12/lib + /usr/local/llvm120/lib + /usr/local/llvm12/lib /mingw64/lib /c/msys64/mingw64/lib c:/msys64/mingw64/lib) diff --git a/cmake/Findllvm.cmake b/cmake/Findllvm.cmake index 4984723ec2..bfde645cad 100644 --- a/cmake/Findllvm.cmake +++ b/cmake/Findllvm.cmake @@ -9,37 +9,37 @@ find_path(LLVM_INCLUDE_DIRS NAMES llvm/IR/IRBuilder.h PATHS - /usr/lib/llvm/11/include - /usr/lib/llvm-11/include - /usr/lib/llvm-11.0/include - /usr/local/llvm11/include - /usr/local/llvm110/include + /usr/lib/llvm/12/include + /usr/lib/llvm-12/include + /usr/lib/llvm-12.0/include + /usr/local/llvm12/include + /usr/local/llvm120/include /mingw64/include ) if(ZIG_PREFER_CLANG_CPP_DYLIB) find_library(LLVM_LIBRARIES NAMES - LLVM-11.0 - LLVM-11 - LLVM-110 + LLVM-12.0 + LLVM-12 + LLVM-120 LLVM PATHS ${LLVM_LIBDIRS} - /usr/lib/llvm/11/lib - /usr/lib/llvm/11/lib64 - /usr/lib/llvm-11/lib - /usr/local/llvm11/lib - /usr/local/llvm110/lib + /usr/lib/llvm/12/lib + /usr/lib/llvm/12/lib64 + /usr/lib/llvm-12/lib + /usr/local/llvm12/lib + /usr/local/llvm120/lib ) find_program(LLVM_CONFIG_EXE - NAMES llvm-config-11 llvm-config-11.0 llvm-config110 llvm-config11 llvm-config + NAMES llvm-config-12 llvm-config-12.0 llvm-config120 llvm-config12 llvm-config PATHS "/mingw64/bin" 
"/c/msys64/mingw64/bin" "c:/msys64/mingw64/bin" - "C:/Libraries/llvm-11.0.0/bin") + "C:/Libraries/llvm-12.0.0/bin") if ("${LLVM_CONFIG_EXE}" STREQUAL "LLVM_CONFIG_EXE-NOTFOUND") message(FATAL_ERROR "unable to find llvm-config") @@ -54,23 +54,23 @@ if(ZIG_PREFER_CLANG_CPP_DYLIB) OUTPUT_VARIABLE LLVM_CONFIG_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) - if("${LLVM_CONFIG_VERSION}" VERSION_LESS 11) - message(FATAL_ERROR "expected LLVM 11.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}") + if("${LLVM_CONFIG_VERSION}" VERSION_LESS 12) + message(FATAL_ERROR "expected LLVM 12.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}") endif() - if("${LLVM_CONFIG_VERSION}" VERSION_EQUAL 12) - message(FATAL_ERROR "expected LLVM 11.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}") + if("${LLVM_CONFIG_VERSION}" VERSION_EQUAL 13) + message(FATAL_ERROR "expected LLVM 12.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}") endif() - if("${LLVM_CONFIG_VERSION}" VERSION_GREATER 12) - message(FATAL_ERROR "expected LLVM 11.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}") + if("${LLVM_CONFIG_VERSION}" VERSION_GREATER 13) + message(FATAL_ERROR "expected LLVM 12.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}") endif() elseif(ZIG_USE_LLVM_CONFIG) find_program(LLVM_CONFIG_EXE - NAMES llvm-config-11 llvm-config-11.0 llvm-config110 llvm-config11 llvm-config + NAMES llvm-config-12 llvm-config-12.0 llvm-config120 llvm-config12 llvm-config PATHS "/mingw64/bin" "/c/msys64/mingw64/bin" "c:/msys64/mingw64/bin" - "C:/Libraries/llvm-11.0.0/bin") + "C:/Libraries/llvm-12.0.0/bin") if ("${LLVM_CONFIG_EXE}" STREQUAL "LLVM_CONFIG_EXE-NOTFOUND") message(FATAL_ERROR "unable to find llvm-config") @@ -85,14 +85,14 @@ elseif(ZIG_USE_LLVM_CONFIG) OUTPUT_VARIABLE LLVM_CONFIG_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) - if("${LLVM_CONFIG_VERSION}" VERSION_LESS 11) - message(FATAL_ERROR "expected LLVM 11.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}") + if("${LLVM_CONFIG_VERSION}" VERSION_LESS 12) + message(FATAL_ERROR "expected LLVM 12.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}") endif() - if("${LLVM_CONFIG_VERSION}" VERSION_EQUAL 12) - message(FATAL_ERROR "expected LLVM 11.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}") + if("${LLVM_CONFIG_VERSION}" VERSION_EQUAL 13) + message(FATAL_ERROR "expected LLVM 12.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}") endif() - if("${LLVM_CONFIG_VERSION}" VERSION_GREATER 12) - message(FATAL_ERROR "expected LLVM 11.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}") + if("${LLVM_CONFIG_VERSION}" VERSION_GREATER 13) + message(FATAL_ERROR "expected LLVM 12.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}") endif() execute_process( @@ -166,7 +166,7 @@ elseif(ZIG_USE_LLVM_CONFIG) set(LLVM_LIBRARIES ${LLVM_LIBRARIES} ${LLVM_SYSTEM_LIBS}) if(NOT LLVM_LIBRARIES) - find_library(LLVM_LIBRARIES NAMES LLVM LLVM-11 LLVM-11.0) + find_library(LLVM_LIBRARIES NAMES LLVM LLVM-12 LLVM-12.0) endif() link_directories("${CMAKE_PREFIX_PATH}/lib") @@ -180,11 +180,11 @@ else() find_library(LLVM_${_prettylibname_}_LIB NAMES ${_libname_} PATHS ${LLVM_LIBDIRS} - /usr/lib/llvm/11/lib - /usr/lib/llvm-11/lib - /usr/lib/llvm-11.0/lib - /usr/local/llvm110/lib - /usr/local/llvm11/lib + /usr/lib/llvm/12/lib + /usr/lib/llvm-12/lib + /usr/lib/llvm-12.0/lib + /usr/local/llvm120/lib + /usr/local/llvm12/lib /mingw64/lib /c/msys64/mingw64/lib c:\\msys64\\mingw64\\lib) @@ -194,78 +194,57 @@ else() # This 
list can be re-generated with `llvm-config --libfiles` and then # reformatting using your favorite text editor. Note we do not execute # `llvm-config` here because we are cross compiling. - FIND_AND_ADD_LLVM_LIB(LLVMXRay) FIND_AND_ADD_LLVM_LIB(LLVMWindowsManifest) - FIND_AND_ADD_LLVM_LIB(LLVMSymbolize) - FIND_AND_ADD_LLVM_LIB(LLVMDebugInfoPDB) - FIND_AND_ADD_LLVM_LIB(LLVMOrcJIT) - FIND_AND_ADD_LLVM_LIB(LLVMOrcError) - FIND_AND_ADD_LLVM_LIB(LLVMJITLink) - FIND_AND_ADD_LLVM_LIB(LLVMObjectYAML) - FIND_AND_ADD_LLVM_LIB(LLVMMCA) - FIND_AND_ADD_LLVM_LIB(LLVMLTO) - FIND_AND_ADD_LLVM_LIB(LLVMPasses) - FIND_AND_ADD_LLVM_LIB(LLVMCoroutines) - FIND_AND_ADD_LLVM_LIB(LLVMObjCARCOpts) - FIND_AND_ADD_LLVM_LIB(LLVMExtensions) - FIND_AND_ADD_LLVM_LIB(LLVMLineEditor) + FIND_AND_ADD_LLVM_LIB(LLVMXRay) FIND_AND_ADD_LLVM_LIB(LLVMLibDriver) - FIND_AND_ADD_LLVM_LIB(LLVMInterpreter) - FIND_AND_ADD_LLVM_LIB(LLVMFuzzMutate) - FIND_AND_ADD_LLVM_LIB(LLVMMCJIT) - FIND_AND_ADD_LLVM_LIB(LLVMExecutionEngine) - FIND_AND_ADD_LLVM_LIB(LLVMRuntimeDyld) - FIND_AND_ADD_LLVM_LIB(LLVMDWARFLinker) FIND_AND_ADD_LLVM_LIB(LLVMDlltoolDriver) - FIND_AND_ADD_LLVM_LIB(LLVMOption) - FIND_AND_ADD_LLVM_LIB(LLVMDebugInfoGSYM) FIND_AND_ADD_LLVM_LIB(LLVMCoverage) + FIND_AND_ADD_LLVM_LIB(LLVMLineEditor) FIND_AND_ADD_LLVM_LIB(LLVMXCoreDisassembler) FIND_AND_ADD_LLVM_LIB(LLVMXCoreCodeGen) FIND_AND_ADD_LLVM_LIB(LLVMXCoreDesc) FIND_AND_ADD_LLVM_LIB(LLVMXCoreInfo) FIND_AND_ADD_LLVM_LIB(LLVMX86Disassembler) - FIND_AND_ADD_LLVM_LIB(LLVMX86CodeGen) FIND_AND_ADD_LLVM_LIB(LLVMX86AsmParser) + FIND_AND_ADD_LLVM_LIB(LLVMX86CodeGen) FIND_AND_ADD_LLVM_LIB(LLVMX86Desc) FIND_AND_ADD_LLVM_LIB(LLVMX86Info) FIND_AND_ADD_LLVM_LIB(LLVMWebAssemblyDisassembler) + FIND_AND_ADD_LLVM_LIB(LLVMWebAssemblyAsmParser) FIND_AND_ADD_LLVM_LIB(LLVMWebAssemblyCodeGen) FIND_AND_ADD_LLVM_LIB(LLVMWebAssemblyDesc) - FIND_AND_ADD_LLVM_LIB(LLVMWebAssemblyAsmParser) FIND_AND_ADD_LLVM_LIB(LLVMWebAssemblyInfo) FIND_AND_ADD_LLVM_LIB(LLVMSystemZDisassembler) - FIND_AND_ADD_LLVM_LIB(LLVMSystemZCodeGen) FIND_AND_ADD_LLVM_LIB(LLVMSystemZAsmParser) + FIND_AND_ADD_LLVM_LIB(LLVMSystemZCodeGen) FIND_AND_ADD_LLVM_LIB(LLVMSystemZDesc) FIND_AND_ADD_LLVM_LIB(LLVMSystemZInfo) FIND_AND_ADD_LLVM_LIB(LLVMSparcDisassembler) - FIND_AND_ADD_LLVM_LIB(LLVMSparcCodeGen) FIND_AND_ADD_LLVM_LIB(LLVMSparcAsmParser) + FIND_AND_ADD_LLVM_LIB(LLVMSparcCodeGen) FIND_AND_ADD_LLVM_LIB(LLVMSparcDesc) FIND_AND_ADD_LLVM_LIB(LLVMSparcInfo) FIND_AND_ADD_LLVM_LIB(LLVMRISCVDisassembler) - FIND_AND_ADD_LLVM_LIB(LLVMRISCVCodeGen) FIND_AND_ADD_LLVM_LIB(LLVMRISCVAsmParser) + FIND_AND_ADD_LLVM_LIB(LLVMRISCVCodeGen) FIND_AND_ADD_LLVM_LIB(LLVMRISCVDesc) - FIND_AND_ADD_LLVM_LIB(LLVMRISCVUtils) FIND_AND_ADD_LLVM_LIB(LLVMRISCVInfo) FIND_AND_ADD_LLVM_LIB(LLVMPowerPCDisassembler) - FIND_AND_ADD_LLVM_LIB(LLVMPowerPCCodeGen) FIND_AND_ADD_LLVM_LIB(LLVMPowerPCAsmParser) + FIND_AND_ADD_LLVM_LIB(LLVMPowerPCCodeGen) FIND_AND_ADD_LLVM_LIB(LLVMPowerPCDesc) FIND_AND_ADD_LLVM_LIB(LLVMPowerPCInfo) FIND_AND_ADD_LLVM_LIB(LLVMNVPTXCodeGen) FIND_AND_ADD_LLVM_LIB(LLVMNVPTXDesc) FIND_AND_ADD_LLVM_LIB(LLVMNVPTXInfo) FIND_AND_ADD_LLVM_LIB(LLVMMSP430Disassembler) - FIND_AND_ADD_LLVM_LIB(LLVMMSP430CodeGen) FIND_AND_ADD_LLVM_LIB(LLVMMSP430AsmParser) + FIND_AND_ADD_LLVM_LIB(LLVMMSP430CodeGen) FIND_AND_ADD_LLVM_LIB(LLVMMSP430Desc) FIND_AND_ADD_LLVM_LIB(LLVMMSP430Info) FIND_AND_ADD_LLVM_LIB(LLVMMipsDisassembler) - FIND_AND_ADD_LLVM_LIB(LLVMMipsCodeGen) FIND_AND_ADD_LLVM_LIB(LLVMMipsAsmParser) + FIND_AND_ADD_LLVM_LIB(LLVMMipsCodeGen) 
FIND_AND_ADD_LLVM_LIB(LLVMMipsDesc) FIND_AND_ADD_LLVM_LIB(LLVMMipsInfo) FIND_AND_ADD_LLVM_LIB(LLVMLanaiDisassembler) @@ -279,44 +258,73 @@ else() FIND_AND_ADD_LLVM_LIB(LLVMHexagonDesc) FIND_AND_ADD_LLVM_LIB(LLVMHexagonInfo) FIND_AND_ADD_LLVM_LIB(LLVMBPFDisassembler) - FIND_AND_ADD_LLVM_LIB(LLVMBPFCodeGen) FIND_AND_ADD_LLVM_LIB(LLVMBPFAsmParser) + FIND_AND_ADD_LLVM_LIB(LLVMBPFCodeGen) FIND_AND_ADD_LLVM_LIB(LLVMBPFDesc) FIND_AND_ADD_LLVM_LIB(LLVMBPFInfo) FIND_AND_ADD_LLVM_LIB(LLVMAVRDisassembler) - FIND_AND_ADD_LLVM_LIB(LLVMAVRCodeGen) FIND_AND_ADD_LLVM_LIB(LLVMAVRAsmParser) + FIND_AND_ADD_LLVM_LIB(LLVMAVRCodeGen) FIND_AND_ADD_LLVM_LIB(LLVMAVRDesc) FIND_AND_ADD_LLVM_LIB(LLVMAVRInfo) FIND_AND_ADD_LLVM_LIB(LLVMARMDisassembler) - FIND_AND_ADD_LLVM_LIB(LLVMARMCodeGen) FIND_AND_ADD_LLVM_LIB(LLVMARMAsmParser) + FIND_AND_ADD_LLVM_LIB(LLVMARMCodeGen) FIND_AND_ADD_LLVM_LIB(LLVMARMDesc) FIND_AND_ADD_LLVM_LIB(LLVMARMUtils) FIND_AND_ADD_LLVM_LIB(LLVMARMInfo) FIND_AND_ADD_LLVM_LIB(LLVMAMDGPUDisassembler) - FIND_AND_ADD_LLVM_LIB(LLVMAMDGPUCodeGen) - FIND_AND_ADD_LLVM_LIB(LLVMMIRParser) - FIND_AND_ADD_LLVM_LIB(LLVMipo) - FIND_AND_ADD_LLVM_LIB(LLVMInstrumentation) - FIND_AND_ADD_LLVM_LIB(LLVMVectorize) - FIND_AND_ADD_LLVM_LIB(LLVMLinker) - FIND_AND_ADD_LLVM_LIB(LLVMIRReader) - FIND_AND_ADD_LLVM_LIB(LLVMAsmParser) - FIND_AND_ADD_LLVM_LIB(LLVMFrontendOpenMP) FIND_AND_ADD_LLVM_LIB(LLVMAMDGPUAsmParser) + FIND_AND_ADD_LLVM_LIB(LLVMAMDGPUCodeGen) FIND_AND_ADD_LLVM_LIB(LLVMAMDGPUDesc) FIND_AND_ADD_LLVM_LIB(LLVMAMDGPUUtils) FIND_AND_ADD_LLVM_LIB(LLVMAMDGPUInfo) FIND_AND_ADD_LLVM_LIB(LLVMAArch64Disassembler) - FIND_AND_ADD_LLVM_LIB(LLVMMCDisassembler) + FIND_AND_ADD_LLVM_LIB(LLVMAArch64AsmParser) FIND_AND_ADD_LLVM_LIB(LLVMAArch64CodeGen) + FIND_AND_ADD_LLVM_LIB(LLVMAArch64Desc) + FIND_AND_ADD_LLVM_LIB(LLVMAArch64Utils) + FIND_AND_ADD_LLVM_LIB(LLVMAArch64Info) + FIND_AND_ADD_LLVM_LIB(LLVMOrcJIT) + FIND_AND_ADD_LLVM_LIB(LLVMMCJIT) + FIND_AND_ADD_LLVM_LIB(LLVMJITLink) + FIND_AND_ADD_LLVM_LIB(LLVMOrcTargetProcess) + FIND_AND_ADD_LLVM_LIB(LLVMOrcShared) + FIND_AND_ADD_LLVM_LIB(LLVMInterpreter) + FIND_AND_ADD_LLVM_LIB(LLVMExecutionEngine) + FIND_AND_ADD_LLVM_LIB(LLVMRuntimeDyld) + FIND_AND_ADD_LLVM_LIB(LLVMSymbolize) + FIND_AND_ADD_LLVM_LIB(LLVMDebugInfoPDB) + FIND_AND_ADD_LLVM_LIB(LLVMDebugInfoGSYM) + FIND_AND_ADD_LLVM_LIB(LLVMOption) + FIND_AND_ADD_LLVM_LIB(LLVMObjectYAML) + FIND_AND_ADD_LLVM_LIB(LLVMMCA) + FIND_AND_ADD_LLVM_LIB(LLVMMCDisassembler) + FIND_AND_ADD_LLVM_LIB(LLVMLTO) + FIND_AND_ADD_LLVM_LIB(LLVMPasses) FIND_AND_ADD_LLVM_LIB(LLVMCFGuard) + FIND_AND_ADD_LLVM_LIB(LLVMCoroutines) + FIND_AND_ADD_LLVM_LIB(LLVMObjCARCOpts) + FIND_AND_ADD_LLVM_LIB(LLVMHelloNew) + FIND_AND_ADD_LLVM_LIB(LLVMipo) + FIND_AND_ADD_LLVM_LIB(LLVMVectorize) + FIND_AND_ADD_LLVM_LIB(LLVMLinker) + FIND_AND_ADD_LLVM_LIB(LLVMInstrumentation) + FIND_AND_ADD_LLVM_LIB(LLVMFrontendOpenMP) + FIND_AND_ADD_LLVM_LIB(LLVMFrontendOpenACC) + FIND_AND_ADD_LLVM_LIB(LLVMExtensions) + FIND_AND_ADD_LLVM_LIB(LLVMDWARFLinker) FIND_AND_ADD_LLVM_LIB(LLVMGlobalISel) - FIND_AND_ADD_LLVM_LIB(LLVMSelectionDAG) + FIND_AND_ADD_LLVM_LIB(LLVMMIRParser) FIND_AND_ADD_LLVM_LIB(LLVMAsmPrinter) FIND_AND_ADD_LLVM_LIB(LLVMDebugInfoDWARF) + FIND_AND_ADD_LLVM_LIB(LLVMSelectionDAG) FIND_AND_ADD_LLVM_LIB(LLVMCodeGen) + FIND_AND_ADD_LLVM_LIB(LLVMIRReader) + FIND_AND_ADD_LLVM_LIB(LLVMAsmParser) + FIND_AND_ADD_LLVM_LIB(LLVMInterfaceStub) + FIND_AND_ADD_LLVM_LIB(LLVMFileCheck) + FIND_AND_ADD_LLVM_LIB(LLVMFuzzMutate) FIND_AND_ADD_LLVM_LIB(LLVMTarget) FIND_AND_ADD_LLVM_LIB(LLVMScalarOpts) 
FIND_AND_ADD_LLVM_LIB(LLVMInstCombine) @@ -327,19 +335,15 @@ else() FIND_AND_ADD_LLVM_LIB(LLVMProfileData) FIND_AND_ADD_LLVM_LIB(LLVMObject) FIND_AND_ADD_LLVM_LIB(LLVMTextAPI) + FIND_AND_ADD_LLVM_LIB(LLVMMCParser) + FIND_AND_ADD_LLVM_LIB(LLVMMC) + FIND_AND_ADD_LLVM_LIB(LLVMDebugInfoCodeView) + FIND_AND_ADD_LLVM_LIB(LLVMDebugInfoMSF) FIND_AND_ADD_LLVM_LIB(LLVMBitReader) FIND_AND_ADD_LLVM_LIB(LLVMCore) FIND_AND_ADD_LLVM_LIB(LLVMRemarks) FIND_AND_ADD_LLVM_LIB(LLVMBitstreamReader) - FIND_AND_ADD_LLVM_LIB(LLVMAArch64AsmParser) - FIND_AND_ADD_LLVM_LIB(LLVMMCParser) - FIND_AND_ADD_LLVM_LIB(LLVMAArch64Desc) - FIND_AND_ADD_LLVM_LIB(LLVMMC) - FIND_AND_ADD_LLVM_LIB(LLVMDebugInfoCodeView) - FIND_AND_ADD_LLVM_LIB(LLVMDebugInfoMSF) FIND_AND_ADD_LLVM_LIB(LLVMBinaryFormat) - FIND_AND_ADD_LLVM_LIB(LLVMAArch64Utils) - FIND_AND_ADD_LLVM_LIB(LLVMAArch64Info) FIND_AND_ADD_LLVM_LIB(LLVMSupport) FIND_AND_ADD_LLVM_LIB(LLVMDemangle) endif() diff --git a/lib/include/__clang_cuda_builtin_vars.h b/lib/include/__clang_cuda_builtin_vars.h index 2ba1521f25..412e823a82 100644 --- a/lib/include/__clang_cuda_builtin_vars.h +++ b/lib/include/__clang_cuda_builtin_vars.h @@ -55,7 +55,9 @@ struct __cuda_builtin_threadIdx_t { __CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_tid_z()); // threadIdx should be convertible to uint3 (in fact in nvcc, it *is* a // uint3). This function is defined after we pull in vector_types.h. + __attribute__((device)) operator dim3() const; __attribute__((device)) operator uint3() const; + private: __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_threadIdx_t); }; @@ -66,7 +68,9 @@ struct __cuda_builtin_blockIdx_t { __CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_ctaid_z()); // blockIdx should be convertible to uint3 (in fact in nvcc, it *is* a // uint3). This function is defined after we pull in vector_types.h. + __attribute__((device)) operator dim3() const; __attribute__((device)) operator uint3() const; + private: __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockIdx_t); }; @@ -78,6 +82,8 @@ struct __cuda_builtin_blockDim_t { // blockDim should be convertible to dim3 (in fact in nvcc, it *is* a // dim3). This function is defined after we pull in vector_types.h. __attribute__((device)) operator dim3() const; + __attribute__((device)) operator uint3() const; + private: __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockDim_t); }; @@ -89,6 +95,8 @@ struct __cuda_builtin_gridDim_t { // gridDim should be convertible to dim3 (in fact in nvcc, it *is* a // dim3). This function is defined after we pull in vector_types.h. __attribute__((device)) operator dim3() const; + __attribute__((device)) operator uint3() const; + private: __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_gridDim_t); }; @@ -108,5 +116,6 @@ __attribute__((device)) const int warpSize = 32; #undef __CUDA_DEVICE_BUILTIN #undef __CUDA_BUILTIN_VAR #undef __CUDA_DISALLOW_BUILTINVAR_ACCESS +#undef __DELETE #endif /* __CUDA_BUILTIN_VARS_H */ diff --git a/lib/include/__clang_cuda_cmath.h b/lib/include/__clang_cuda_cmath.h index 8ba182689a..5bbb59a93b 100644 --- a/lib/include/__clang_cuda_cmath.h +++ b/lib/include/__clang_cuda_cmath.h @@ -66,10 +66,38 @@ __DEVICE__ float frexp(float __arg, int *__exp) { } // For inscrutable reasons, the CUDA headers define these functions for us on -// Windows. For OpenMP we omit these as some old system headers have -// non-conforming `isinf(float)` and `isnan(float)` implementations that return -// an `int`. The system versions of these functions should be fine anyway. 
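// Illustrative sketch, not part of the patch: the __clang_cuda_builtin_vars.h hunk
// above declares dim3/uint3 conversion operators on the builtin-variable wrapper
// types (threadIdx/blockIdx gain operator dim3(), blockDim/gridDim gain operator
// uint3()); the out-of-line definitions land later in __clang_cuda_runtime_wrapper.h.
// A minimal device-side usage sketch, assuming normal CUDA compilation with clang;
// the kernel and buffer names are hypothetical.
__global__ void linear_index_demo(unsigned int *out) {
  dim3 block_pos = blockIdx;   // new: blockIdx/threadIdx convert to dim3
  uint3 block_sz = blockDim;   // new: blockDim/gridDim convert to uint3
  unsigned int i = block_pos.x * block_sz.x + threadIdx.x;
  out[i] = i;                  // write the flattened 1-D thread index
}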
-#if !defined(_MSC_VER) && !defined(__OPENMP_NVPTX__) +// Windows. +#if !defined(_MSC_VER) || defined(__OPENMP_NVPTX__) + +// For OpenMP we work around some old system headers that have non-conforming +// `isinf(float)` and `isnan(float)` implementations that return an `int`. We do +// this by providing two versions of these functions, differing only in the +// return type. To avoid conflicting definitions we disable implicit base +// function generation. That means we will end up with two specializations, one +// per type, but only one has a base function defined by the system header. +#if defined(__OPENMP_NVPTX__) +#pragma omp begin declare variant match( \ + implementation = {extension(disable_implicit_base)}) + +// FIXME: We lack an extension to customize the mangling of the variants, e.g., +// add a suffix. This means we would clash with the names of the variants +// (note that we do not create implicit base functions here). To avoid +// this clash we add a new trait to some of them that is always true +// (this is LLVM after all ;)). It will only influence the mangled name +// of the variants inside the inner region and avoid the clash. +#pragma omp begin declare variant match(implementation = {vendor(llvm)}) + +__DEVICE__ int isinf(float __x) { return ::__isinff(__x); } +__DEVICE__ int isinf(double __x) { return ::__isinf(__x); } +__DEVICE__ int isfinite(float __x) { return ::__finitef(__x); } +__DEVICE__ int isfinite(double __x) { return ::__isfinited(__x); } +__DEVICE__ int isnan(float __x) { return ::__isnanf(__x); } +__DEVICE__ int isnan(double __x) { return ::__isnan(__x); } + +#pragma omp end declare variant + +#endif + __DEVICE__ bool isinf(float __x) { return ::__isinff(__x); } __DEVICE__ bool isinf(double __x) { return ::__isinf(__x); } __DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); } @@ -79,6 +107,11 @@ __DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); } __DEVICE__ bool isfinite(double __x) { return ::__isfinited(__x); } __DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); } __DEVICE__ bool isnan(double __x) { return ::__isnan(__x); } + +#if defined(__OPENMP_NVPTX__) +#pragma omp end declare variant +#endif + #endif __DEVICE__ bool isgreater(float __x, float __y) { @@ -142,6 +175,15 @@ __DEVICE__ float sqrt(float __x) { return ::sqrtf(__x); } __DEVICE__ float tan(float __x) { return ::tanf(__x); } __DEVICE__ float tanh(float __x) { return ::tanhf(__x); } +// There was a redefinition error for this this overload in CUDA mode. +// We restrict it to OpenMP mode for now, that is where it is actually needed +// anyway. +#ifdef __OPENMP_NVPTX__ +__DEVICE__ float remquo(float __n, float __d, int *__q) { + return ::remquof(__n, __d, __q); +} +#endif + // Notably missing above is nexttoward. We omit it because // libdevice doesn't provide an implementation, and we don't want to be in the // business of implementing tricky libm functions in this header. diff --git a/lib/include/__clang_cuda_complex_builtins.h b/lib/include/__clang_cuda_complex_builtins.h index d924487ab2..2b701fef0e 100644 --- a/lib/include/__clang_cuda_complex_builtins.h +++ b/lib/include/__clang_cuda_complex_builtins.h @@ -41,6 +41,27 @@ #define _ABSf std::abs #define _LOGBd std::logb #define _LOGBf std::logb +// Rather than pulling in std::max from algorithm everytime, use available ::max. 
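// Illustrative sketch, not part of the patch: the declare-variant pragmas above work
// around the fact that an overload set cannot contain two functions that differ only
// in return type, so the int-returning (old libc style) and bool-returning
// (C++-conforming) isnan/isinf/isfinite need separate OpenMP variant contexts rather
// than plain overloading. The names below are hypothetical stand-ins.
#include <type_traits>

int my_isnan(float x) { return x != x ? 1 : 0; }  // what an old system header declares
// bool my_isnan(float x) { return x != x; }      // error: differs only in return
                                                  // type, cannot overload
bool my_isnan_device(float x) { return x != x; }  // the conforming device version

static_assert(!std::is_same<decltype(my_isnan(0.0f)),
                            decltype(my_isnan_device(0.0f))>::value,
              "int- and bool-returning versions really do have distinct signatures");

int main() { return my_isnan_device(1.0f) ? 1 : 0; }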
+#define _fmaxd max +#define _fmaxf max +#else +#ifdef __AMDGCN__ +#define _ISNANd __ocml_isnan_f64 +#define _ISNANf __ocml_isnan_f32 +#define _ISINFd __ocml_isinf_f64 +#define _ISINFf __ocml_isinf_f32 +#define _ISFINITEd __ocml_isfinite_f64 +#define _ISFINITEf __ocml_isfinite_f32 +#define _COPYSIGNd __ocml_copysign_f64 +#define _COPYSIGNf __ocml_copysign_f32 +#define _SCALBNd __ocml_scalbn_f64 +#define _SCALBNf __ocml_scalbn_f32 +#define _ABSd __ocml_fabs_f64 +#define _ABSf __ocml_fabs_f32 +#define _LOGBd __ocml_logb_f64 +#define _LOGBf __ocml_logb_f32 +#define _fmaxd __ocml_fmax_f64 +#define _fmaxf __ocml_fmax_f32 #else #define _ISNANd __nv_isnand #define _ISNANf __nv_isnanf @@ -56,6 +77,9 @@ #define _ABSf __nv_fabsf #define _LOGBd __nv_logb #define _LOGBf __nv_logbf +#define _fmaxd __nv_fmax +#define _fmaxf __nv_fmaxf +#endif #endif #if defined(__cplusplus) @@ -167,7 +191,7 @@ __DEVICE__ double _Complex __divdc3(double __a, double __b, double __c, // Can't use std::max, because that's defined in , and we don't // want to pull that in for every compile. The CUDA headers define // ::max(float, float) and ::max(double, double), which is sufficient for us. - double __logbw = _LOGBd(max(_ABSd(__c), _ABSd(__d))); + double __logbw = _LOGBd(_fmaxd(_ABSd(__c), _ABSd(__d))); if (_ISFINITEd(__logbw)) { __ilogbw = (int)__logbw; __c = _SCALBNd(__c, -__ilogbw); @@ -200,7 +224,7 @@ __DEVICE__ double _Complex __divdc3(double __a, double __b, double __c, __DEVICE__ float _Complex __divsc3(float __a, float __b, float __c, float __d) { int __ilogbw = 0; - float __logbw = _LOGBf(max(_ABSf(__c), _ABSf(__d))); + float __logbw = _LOGBf(_fmaxf(_ABSf(__c), _ABSf(__d))); if (_ISFINITEf(__logbw)) { __ilogbw = (int)__logbw; __c = _SCALBNf(__c, -__ilogbw); @@ -249,6 +273,8 @@ __DEVICE__ float _Complex __divsc3(float __a, float __b, float __c, float __d) { #undef _ABSf #undef _LOGBd #undef _LOGBf +#undef _fmaxd +#undef _fmaxf #ifdef __OPENMP_NVPTX__ #pragma omp end declare target diff --git a/lib/include/__clang_cuda_math.h b/lib/include/__clang_cuda_math.h index 332e616702..acb26ad345 100644 --- a/lib/include/__clang_cuda_math.h +++ b/lib/include/__clang_cuda_math.h @@ -195,8 +195,8 @@ __DEVICE__ int max(int __a, int __b) { return __nv_max(__a, __b); } __DEVICE__ int min(int __a, int __b) { return __nv_min(__a, __b); } __DEVICE__ double modf(double __a, double *__b) { return __nv_modf(__a, __b); } __DEVICE__ float modff(float __a, float *__b) { return __nv_modff(__a, __b); } -__DEVICE__ double nearbyint(double __a) { return __nv_nearbyint(__a); } -__DEVICE__ float nearbyintf(float __a) { return __nv_nearbyintf(__a); } +__DEVICE__ double nearbyint(double __a) { return __builtin_nearbyint(__a); } +__DEVICE__ float nearbyintf(float __a) { return __builtin_nearbyintf(__a); } __DEVICE__ double nextafter(double __a, double __b) { return __nv_nextafter(__a, __b); } @@ -249,8 +249,9 @@ __DEVICE__ double rhypot(double __a, double __b) { __DEVICE__ float rhypotf(float __a, float __b) { return __nv_rhypotf(__a, __b); } -__DEVICE__ double rint(double __a) { return __nv_rint(__a); } -__DEVICE__ float rintf(float __a) { return __nv_rintf(__a); } +// __nv_rint* in libdevice is buggy and produces incorrect results. 
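// Illustrative host-side sketch, not part of the patch: the __clang_cuda_math.h hunk
// above routes nearbyint/rint through the compiler builtins instead of the libdevice
// __nv_* entry points, since __nv_rint* is noted as buggy. The expected semantics are
// "round to nearest, ties to even" under the default rounding mode, as this
// standalone check shows.
#include <cfenv>
#include <cmath>
#include <cstdio>

int main(void) {
  std::fesetround(FE_TONEAREST);                                 // default rounding
  std::printf("rint(2.5)       = %.1f\n", std::rint(2.5));       // 2.0, not 3.0
  std::printf("rint(3.5)       = %.1f\n", std::rint(3.5));       // 4.0
  std::printf("nearbyint(-0.5) = %.1f\n", std::nearbyint(-0.5)); // -0.0
  return 0;
}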
+__DEVICE__ double rint(double __a) { return __builtin_rint(__a); } +__DEVICE__ float rintf(float __a) { return __builtin_rintf(__a); } __DEVICE__ double rnorm(int __a, const double *__b) { return __nv_rnorm(__a, __b); } diff --git a/lib/include/__clang_cuda_math_forward_declares.h b/lib/include/__clang_cuda_math_forward_declares.h index 8a270859e4..c0f1f47cc9 100644 --- a/lib/include/__clang_cuda_math_forward_declares.h +++ b/lib/include/__clang_cuda_math_forward_declares.h @@ -160,6 +160,9 @@ __DEVICE__ double scalbln(double, long); __DEVICE__ float scalbln(float, long); __DEVICE__ double scalbn(double, int); __DEVICE__ float scalbn(float, int); +#ifdef _MSC_VER +__DEVICE__ bool signbit(long double); +#endif __DEVICE__ bool signbit(double); __DEVICE__ bool signbit(float); __DEVICE__ double sin(double); diff --git a/lib/include/__clang_cuda_runtime_wrapper.h b/lib/include/__clang_cuda_runtime_wrapper.h index f43ed55de4..f88c39a9b6 100644 --- a/lib/include/__clang_cuda_runtime_wrapper.h +++ b/lib/include/__clang_cuda_runtime_wrapper.h @@ -377,30 +377,38 @@ __device__ static inline void *malloc(size_t __size) { // Out-of-line implementations from __clang_cuda_builtin_vars.h. These need to // come after we've pulled in the definition of uint3 and dim3. +__device__ inline __cuda_builtin_threadIdx_t::operator dim3() const { + return dim3(x, y, z); +} + __device__ inline __cuda_builtin_threadIdx_t::operator uint3() const { - uint3 ret; - ret.x = x; - ret.y = y; - ret.z = z; - return ret; + return {x, y, z}; +} + +__device__ inline __cuda_builtin_blockIdx_t::operator dim3() const { + return dim3(x, y, z); } __device__ inline __cuda_builtin_blockIdx_t::operator uint3() const { - uint3 ret; - ret.x = x; - ret.y = y; - ret.z = z; - return ret; + return {x, y, z}; } __device__ inline __cuda_builtin_blockDim_t::operator dim3() const { return dim3(x, y, z); } +__device__ inline __cuda_builtin_blockDim_t::operator uint3() const { + return {x, y, z}; +} + __device__ inline __cuda_builtin_gridDim_t::operator dim3() const { return dim3(x, y, z); } +__device__ inline __cuda_builtin_gridDim_t::operator uint3() const { + return {x, y, z}; +} + #include <__clang_cuda_cmath.h> #include <__clang_cuda_intrinsics.h> #include <__clang_cuda_complex_builtins.h> diff --git a/lib/include/__clang_hip_cmath.h b/lib/include/__clang_hip_cmath.h new file mode 100644 index 0000000000..cd22a2df95 --- /dev/null +++ b/lib/include/__clang_hip_cmath.h @@ -0,0 +1,664 @@ +/*===---- __clang_hip_cmath.h - HIP cmath decls -----------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CLANG_HIP_CMATH_H__ +#define __CLANG_HIP_CMATH_H__ + +#if !defined(__HIP__) +#error "This file is for HIP and OpenMP AMDGCN device compilation only." +#endif + +#if defined(__cplusplus) +#include +#include +#include +#endif +#include +#include + +#pragma push_macro("__DEVICE__") +#define __DEVICE__ static __device__ inline __attribute__((always_inline)) + +// Start with functions that cannot be defined by DEF macros below. 
+#if defined(__cplusplus) +__DEVICE__ double abs(double __x) { return ::fabs(__x); } +__DEVICE__ float abs(float __x) { return ::fabsf(__x); } +__DEVICE__ long long abs(long long __n) { return ::llabs(__n); } +__DEVICE__ long abs(long __n) { return ::labs(__n); } +__DEVICE__ float fma(float __x, float __y, float __z) { + return ::fmaf(__x, __y, __z); +} +__DEVICE__ int fpclassify(float __x) { + return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL, + FP_ZERO, __x); +} +__DEVICE__ int fpclassify(double __x) { + return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL, + FP_ZERO, __x); +} +__DEVICE__ float frexp(float __arg, int *__exp) { + return ::frexpf(__arg, __exp); +} +__DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); } +__DEVICE__ bool isfinite(double __x) { return ::__finite(__x); } +__DEVICE__ bool isgreater(float __x, float __y) { + return __builtin_isgreater(__x, __y); +} +__DEVICE__ bool isgreater(double __x, double __y) { + return __builtin_isgreater(__x, __y); +} +__DEVICE__ bool isgreaterequal(float __x, float __y) { + return __builtin_isgreaterequal(__x, __y); +} +__DEVICE__ bool isgreaterequal(double __x, double __y) { + return __builtin_isgreaterequal(__x, __y); +} +__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); } +__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); } +__DEVICE__ bool isless(float __x, float __y) { + return __builtin_isless(__x, __y); +} +__DEVICE__ bool isless(double __x, double __y) { + return __builtin_isless(__x, __y); +} +__DEVICE__ bool islessequal(float __x, float __y) { + return __builtin_islessequal(__x, __y); +} +__DEVICE__ bool islessequal(double __x, double __y) { + return __builtin_islessequal(__x, __y); +} +__DEVICE__ bool islessgreater(float __x, float __y) { + return __builtin_islessgreater(__x, __y); +} +__DEVICE__ bool islessgreater(double __x, double __y) { + return __builtin_islessgreater(__x, __y); +} +__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); } +__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); } +__DEVICE__ bool isnormal(float __x) { return __builtin_isnormal(__x); } +__DEVICE__ bool isnormal(double __x) { return __builtin_isnormal(__x); } +__DEVICE__ bool isunordered(float __x, float __y) { + return __builtin_isunordered(__x, __y); +} +__DEVICE__ bool isunordered(double __x, double __y) { + return __builtin_isunordered(__x, __y); +} +__DEVICE__ float modf(float __x, float *__iptr) { return ::modff(__x, __iptr); } +__DEVICE__ float pow(float __base, int __iexp) { + return ::powif(__base, __iexp); +} +__DEVICE__ double pow(double __base, int __iexp) { + return ::powi(__base, __iexp); +} +__DEVICE__ float remquo(float __x, float __y, int *__quo) { + return ::remquof(__x, __y, __quo); +} +__DEVICE__ float scalbln(float __x, long int __n) { + return ::scalblnf(__x, __n); +} +__DEVICE__ bool signbit(float __x) { return ::__signbitf(__x); } +__DEVICE__ bool signbit(double __x) { return ::__signbit(__x); } + +// Notably missing above is nexttoward. We omit it because +// ocml doesn't provide an implementation, and we don't want to be in the +// business of implementing tricky libm functions in this header. + +// Other functions. 
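// Illustrative host-side sketch, not part of the patch: the fpclassify wrappers above
// rely on __builtin_fpclassify, which takes the five category constants first
// (NaN, infinite, normal, subnormal, zero) and the operand last, and returns whichever
// constant describes the value. Helper names here are hypothetical.
#include <cmath>
#include <cstdio>

static const char *classify(double v) {
  switch (__builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
                               FP_ZERO, v)) {
  case FP_ZERO:      return "zero";
  case FP_SUBNORMAL: return "subnormal";
  case FP_NORMAL:    return "normal";
  case FP_INFINITE:  return "infinite";
  default:           return "nan";
  }
}

int main(void) {
  std::printf("%s %s %s %s %s\n", classify(0.0), classify(1e-310), classify(1.5),
              classify(INFINITY), classify(NAN));
  // prints: zero subnormal normal infinite nan
  return 0;
}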
+__DEVICE__ _Float16 fma(_Float16 __x, _Float16 __y, _Float16 __z) { + return __ocml_fma_f16(__x, __y, __z); +} +__DEVICE__ _Float16 pow(_Float16 __base, int __iexp) { + return __ocml_pown_f16(__base, __iexp); +} + +// BEGIN DEF_FUN and HIP_OVERLOAD + +// BEGIN DEF_FUN + +#pragma push_macro("__DEF_FUN1") +#pragma push_macro("__DEF_FUN2") +#pragma push_macro("__DEF_FUN2_FI") + +// Define cmath functions with float argument and returns __retty. +#define __DEF_FUN1(__retty, __func) \ + __DEVICE__ \ + __retty __func(float __x) { return __func##f(__x); } + +// Define cmath functions with two float arguments and returns __retty. +#define __DEF_FUN2(__retty, __func) \ + __DEVICE__ \ + __retty __func(float __x, float __y) { return __func##f(__x, __y); } + +// Define cmath functions with a float and an int argument and returns __retty. +#define __DEF_FUN2_FI(__retty, __func) \ + __DEVICE__ \ + __retty __func(float __x, int __y) { return __func##f(__x, __y); } + +__DEF_FUN1(float, acos) +__DEF_FUN1(float, acosh) +__DEF_FUN1(float, asin) +__DEF_FUN1(float, asinh) +__DEF_FUN1(float, atan) +__DEF_FUN2(float, atan2) +__DEF_FUN1(float, atanh) +__DEF_FUN1(float, cbrt) +__DEF_FUN1(float, ceil) +__DEF_FUN2(float, copysign) +__DEF_FUN1(float, cos) +__DEF_FUN1(float, cosh) +__DEF_FUN1(float, erf) +__DEF_FUN1(float, erfc) +__DEF_FUN1(float, exp) +__DEF_FUN1(float, exp2) +__DEF_FUN1(float, expm1) +__DEF_FUN1(float, fabs) +__DEF_FUN2(float, fdim) +__DEF_FUN1(float, floor) +__DEF_FUN2(float, fmax) +__DEF_FUN2(float, fmin) +__DEF_FUN2(float, fmod) +__DEF_FUN2(float, hypot) +__DEF_FUN1(int, ilogb) +__DEF_FUN2_FI(float, ldexp) +__DEF_FUN1(float, lgamma) +__DEF_FUN1(float, log) +__DEF_FUN1(float, log10) +__DEF_FUN1(float, log1p) +__DEF_FUN1(float, log2) +__DEF_FUN1(float, logb) +__DEF_FUN1(long long, llrint) +__DEF_FUN1(long long, llround) +__DEF_FUN1(long, lrint) +__DEF_FUN1(long, lround) +__DEF_FUN1(float, nearbyint) +__DEF_FUN2(float, nextafter) +__DEF_FUN2(float, pow) +__DEF_FUN2(float, remainder) +__DEF_FUN1(float, rint) +__DEF_FUN1(float, round) +__DEF_FUN2_FI(float, scalbn) +__DEF_FUN1(float, sin) +__DEF_FUN1(float, sinh) +__DEF_FUN1(float, sqrt) +__DEF_FUN1(float, tan) +__DEF_FUN1(float, tanh) +__DEF_FUN1(float, tgamma) +__DEF_FUN1(float, trunc) + +#pragma pop_macro("__DEF_FUN1") +#pragma pop_macro("__DEF_FUN2") +#pragma pop_macro("__DEF_FUN2_FI") + +// END DEF_FUN + +// BEGIN HIP_OVERLOAD + +#pragma push_macro("__HIP_OVERLOAD1") +#pragma push_macro("__HIP_OVERLOAD2") + +// __hip_enable_if::type is a type function which returns __T if __B is true. +template struct __hip_enable_if {}; + +template struct __hip_enable_if { typedef __T type; }; + +// decltype is only available in C++11 and above. +#if __cplusplus >= 201103L +// __hip_promote +namespace __hip { + +template struct __numeric_type { + static void __test(...); + static _Float16 __test(_Float16); + static float __test(float); + static double __test(char); + static double __test(int); + static double __test(unsigned); + static double __test(long); + static double __test(unsigned long); + static double __test(long long); + static double __test(unsigned long long); + static double __test(double); + // No support for long double, use double instead. 
+ static double __test(long double); + + typedef decltype(__test(std::declval<_Tp>())) type; + static const bool value = !std::is_same::value; +}; + +template <> struct __numeric_type { static const bool value = true; }; + +template ::value &&__numeric_type<_A2>::value + &&__numeric_type<_A3>::value> +class __promote_imp { +public: + static const bool value = false; +}; + +template +class __promote_imp<_A1, _A2, _A3, true> { +private: + typedef typename __promote_imp<_A1>::type __type1; + typedef typename __promote_imp<_A2>::type __type2; + typedef typename __promote_imp<_A3>::type __type3; + +public: + typedef decltype(__type1() + __type2() + __type3()) type; + static const bool value = true; +}; + +template class __promote_imp<_A1, _A2, void, true> { +private: + typedef typename __promote_imp<_A1>::type __type1; + typedef typename __promote_imp<_A2>::type __type2; + +public: + typedef decltype(__type1() + __type2()) type; + static const bool value = true; +}; + +template class __promote_imp<_A1, void, void, true> { +public: + typedef typename __numeric_type<_A1>::type type; + static const bool value = true; +}; + +template +class __promote : public __promote_imp<_A1, _A2, _A3> {}; + +} // namespace __hip +#endif //__cplusplus >= 201103L + +// __HIP_OVERLOAD1 is used to resolve function calls with integer argument to +// avoid compilation error due to ambibuity. e.g. floor(5) is resolved with +// floor(double). +#define __HIP_OVERLOAD1(__retty, __fn) \ + template \ + __DEVICE__ typename __hip_enable_if::is_integer, \ + __retty>::type \ + __fn(__T __x) { \ + return ::__fn((double)__x); \ + } + +// __HIP_OVERLOAD2 is used to resolve function calls with mixed float/double +// or integer argument to avoid compilation error due to ambibuity. e.g. +// max(5.0f, 6.0) is resolved with max(double, double). 
+#if __cplusplus >= 201103L +#define __HIP_OVERLOAD2(__retty, __fn) \ + template \ + __DEVICE__ typename __hip_enable_if< \ + std::numeric_limits<__T1>::is_specialized && \ + std::numeric_limits<__T2>::is_specialized, \ + typename __hip::__promote<__T1, __T2>::type>::type \ + __fn(__T1 __x, __T2 __y) { \ + typedef typename __hip::__promote<__T1, __T2>::type __result_type; \ + return __fn((__result_type)__x, (__result_type)__y); \ + } +#else +#define __HIP_OVERLOAD2(__retty, __fn) \ + template \ + __DEVICE__ \ + typename __hip_enable_if::is_specialized && \ + std::numeric_limits<__T2>::is_specialized, \ + __retty>::type \ + __fn(__T1 __x, __T2 __y) { \ + return __fn((double)__x, (double)__y); \ + } +#endif + +__HIP_OVERLOAD1(double, abs) +__HIP_OVERLOAD1(double, acos) +__HIP_OVERLOAD1(double, acosh) +__HIP_OVERLOAD1(double, asin) +__HIP_OVERLOAD1(double, asinh) +__HIP_OVERLOAD1(double, atan) +__HIP_OVERLOAD2(double, atan2) +__HIP_OVERLOAD1(double, atanh) +__HIP_OVERLOAD1(double, cbrt) +__HIP_OVERLOAD1(double, ceil) +__HIP_OVERLOAD2(double, copysign) +__HIP_OVERLOAD1(double, cos) +__HIP_OVERLOAD1(double, cosh) +__HIP_OVERLOAD1(double, erf) +__HIP_OVERLOAD1(double, erfc) +__HIP_OVERLOAD1(double, exp) +__HIP_OVERLOAD1(double, exp2) +__HIP_OVERLOAD1(double, expm1) +__HIP_OVERLOAD1(double, fabs) +__HIP_OVERLOAD2(double, fdim) +__HIP_OVERLOAD1(double, floor) +__HIP_OVERLOAD2(double, fmax) +__HIP_OVERLOAD2(double, fmin) +__HIP_OVERLOAD2(double, fmod) +__HIP_OVERLOAD1(int, fpclassify) +__HIP_OVERLOAD2(double, hypot) +__HIP_OVERLOAD1(int, ilogb) +__HIP_OVERLOAD1(bool, isfinite) +__HIP_OVERLOAD2(bool, isgreater) +__HIP_OVERLOAD2(bool, isgreaterequal) +__HIP_OVERLOAD1(bool, isinf) +__HIP_OVERLOAD2(bool, isless) +__HIP_OVERLOAD2(bool, islessequal) +__HIP_OVERLOAD2(bool, islessgreater) +__HIP_OVERLOAD1(bool, isnan) +__HIP_OVERLOAD1(bool, isnormal) +__HIP_OVERLOAD2(bool, isunordered) +__HIP_OVERLOAD1(double, lgamma) +__HIP_OVERLOAD1(double, log) +__HIP_OVERLOAD1(double, log10) +__HIP_OVERLOAD1(double, log1p) +__HIP_OVERLOAD1(double, log2) +__HIP_OVERLOAD1(double, logb) +__HIP_OVERLOAD1(long long, llrint) +__HIP_OVERLOAD1(long long, llround) +__HIP_OVERLOAD1(long, lrint) +__HIP_OVERLOAD1(long, lround) +__HIP_OVERLOAD1(double, nearbyint) +__HIP_OVERLOAD2(double, nextafter) +__HIP_OVERLOAD2(double, pow) +__HIP_OVERLOAD2(double, remainder) +__HIP_OVERLOAD1(double, rint) +__HIP_OVERLOAD1(double, round) +__HIP_OVERLOAD1(bool, signbit) +__HIP_OVERLOAD1(double, sin) +__HIP_OVERLOAD1(double, sinh) +__HIP_OVERLOAD1(double, sqrt) +__HIP_OVERLOAD1(double, tan) +__HIP_OVERLOAD1(double, tanh) +__HIP_OVERLOAD1(double, tgamma) +__HIP_OVERLOAD1(double, trunc) + +// Overload these but don't add them to std, they are not part of cmath. +__HIP_OVERLOAD2(double, max) +__HIP_OVERLOAD2(double, min) + +// Additional Overloads that don't quite match HIP_OVERLOAD. 
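// Illustrative host-side sketch, not part of the patch, of the trait logic the
// __HIP_OVERLOAD1/__HIP_OVERLOAD2 macros above build on: integral arguments are
// accepted via numeric_limits<>::is_integer and forwarded as double, and mixed
// float/double calls are promoted with ordinary arithmetic, which is why
// max(5.0f, 6.0) ends up calling max(double, double).
#include <limits>
#include <type_traits>

static_assert(std::numeric_limits<int>::is_integer,
              "OVERLOAD1: floor(5) is accepted and its argument cast to double");
static_assert(std::numeric_limits<float>::is_specialized &&
                  std::numeric_limits<double>::is_specialized,
              "OVERLOAD2: both argument types must be arithmetic-like");
static_assert(std::is_same<decltype(float() + double()), double>::value,
              "the promoted common type of (float, double) is double");

int main() { return 0; }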
+#if __cplusplus >= 201103L +template +__DEVICE__ typename __hip_enable_if< + std::numeric_limits<__T1>::is_specialized && + std::numeric_limits<__T2>::is_specialized && + std::numeric_limits<__T3>::is_specialized, + typename __hip::__promote<__T1, __T2, __T3>::type>::type +fma(__T1 __x, __T2 __y, __T3 __z) { + typedef typename __hip::__promote<__T1, __T2, __T3>::type __result_type; + return ::fma((__result_type)__x, (__result_type)__y, (__result_type)__z); +} +#else +template +__DEVICE__ + typename __hip_enable_if::is_specialized && + std::numeric_limits<__T2>::is_specialized && + std::numeric_limits<__T3>::is_specialized, + double>::type + fma(__T1 __x, __T2 __y, __T3 __z) { + return ::fma((double)__x, (double)__y, (double)__z); +} +#endif + +template +__DEVICE__ + typename __hip_enable_if::is_integer, double>::type + frexp(__T __x, int *__exp) { + return ::frexp((double)__x, __exp); +} + +template +__DEVICE__ + typename __hip_enable_if::is_integer, double>::type + ldexp(__T __x, int __exp) { + return ::ldexp((double)__x, __exp); +} + +template +__DEVICE__ + typename __hip_enable_if::is_integer, double>::type + modf(__T __x, double *__exp) { + return ::modf((double)__x, __exp); +} + +#if __cplusplus >= 201103L +template +__DEVICE__ + typename __hip_enable_if::is_specialized && + std::numeric_limits<__T2>::is_specialized, + typename __hip::__promote<__T1, __T2>::type>::type + remquo(__T1 __x, __T2 __y, int *__quo) { + typedef typename __hip::__promote<__T1, __T2>::type __result_type; + return ::remquo((__result_type)__x, (__result_type)__y, __quo); +} +#else +template +__DEVICE__ + typename __hip_enable_if::is_specialized && + std::numeric_limits<__T2>::is_specialized, + double>::type + remquo(__T1 __x, __T2 __y, int *__quo) { + return ::remquo((double)__x, (double)__y, __quo); +} +#endif + +template +__DEVICE__ + typename __hip_enable_if::is_integer, double>::type + scalbln(__T __x, long int __exp) { + return ::scalbln((double)__x, __exp); +} + +template +__DEVICE__ + typename __hip_enable_if::is_integer, double>::type + scalbn(__T __x, int __exp) { + return ::scalbn((double)__x, __exp); +} + +#pragma pop_macro("__HIP_OVERLOAD1") +#pragma pop_macro("__HIP_OVERLOAD2") + +// END HIP_OVERLOAD + +// END DEF_FUN and HIP_OVERLOAD + +#endif // defined(__cplusplus) + +// Define these overloads inside the namespace our standard library uses. +#ifdef _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_NAMESPACE_STD +#else +namespace std { +#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION +_GLIBCXX_BEGIN_NAMESPACE_VERSION +#endif +#endif + +// Pull the new overloads we defined above into namespace std. +// using ::abs; - This may be considered for C++. +using ::acos; +using ::acosh; +using ::asin; +using ::asinh; +using ::atan; +using ::atan2; +using ::atanh; +using ::cbrt; +using ::ceil; +using ::copysign; +using ::cos; +using ::cosh; +using ::erf; +using ::erfc; +using ::exp; +using ::exp2; +using ::expm1; +using ::fabs; +using ::fdim; +using ::floor; +using ::fma; +using ::fmax; +using ::fmin; +using ::fmod; +using ::fpclassify; +using ::frexp; +using ::hypot; +using ::ilogb; +using ::isfinite; +using ::isgreater; +using ::isgreaterequal; +using ::isless; +using ::islessequal; +using ::islessgreater; +using ::isnormal; +using ::isunordered; +using ::ldexp; +using ::lgamma; +using ::llrint; +using ::llround; +using ::log; +using ::log10; +using ::log1p; +using ::log2; +using ::logb; +using ::lrint; +using ::lround; +using ::modf; +// using ::nan; - This may be considered for C++. 
+// using ::nanf; - This may be considered for C++. +// using ::nanl; - This is not yet defined. +using ::nearbyint; +using ::nextafter; +// using ::nexttoward; - Omit this since we do not have a definition. +using ::pow; +using ::remainder; +using ::remquo; +using ::rint; +using ::round; +using ::scalbln; +using ::scalbn; +using ::signbit; +using ::sin; +using ::sinh; +using ::sqrt; +using ::tan; +using ::tanh; +using ::tgamma; +using ::trunc; + +// Well this is fun: We need to pull these symbols in for libc++, but we can't +// pull them in with libstdc++, because its ::isinf and ::isnan are different +// than its std::isinf and std::isnan. +#ifndef __GLIBCXX__ +using ::isinf; +using ::isnan; +#endif + +// Finally, pull the "foobarf" functions that HIP defines into std. +using ::acosf; +using ::acoshf; +using ::asinf; +using ::asinhf; +using ::atan2f; +using ::atanf; +using ::atanhf; +using ::cbrtf; +using ::ceilf; +using ::copysignf; +using ::cosf; +using ::coshf; +using ::erfcf; +using ::erff; +using ::exp2f; +using ::expf; +using ::expm1f; +using ::fabsf; +using ::fdimf; +using ::floorf; +using ::fmaf; +using ::fmaxf; +using ::fminf; +using ::fmodf; +using ::frexpf; +using ::hypotf; +using ::ilogbf; +using ::ldexpf; +using ::lgammaf; +using ::llrintf; +using ::llroundf; +using ::log10f; +using ::log1pf; +using ::log2f; +using ::logbf; +using ::logf; +using ::lrintf; +using ::lroundf; +using ::modff; +using ::nearbyintf; +using ::nextafterf; +// using ::nexttowardf; - Omit this since we do not have a definition. +using ::powf; +using ::remainderf; +using ::remquof; +using ::rintf; +using ::roundf; +using ::scalblnf; +using ::scalbnf; +using ::sinf; +using ::sinhf; +using ::sqrtf; +using ::tanf; +using ::tanhf; +using ::tgammaf; +using ::truncf; + +#ifdef _LIBCPP_END_NAMESPACE_STD +_LIBCPP_END_NAMESPACE_STD +#else +#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION +_GLIBCXX_END_NAMESPACE_VERSION +#endif +} // namespace std +#endif + +// Define device-side math functions from on MSVC. +#if defined(_MSC_VER) + +// Before VS2019, `` is also included in `` and other headers. +// But, from VS2019, it's only included in ``. Need to include +// `` here to ensure C functions declared there won't be markded as +// `__host__` and `__device__` through `` wrapper. 
+#include + +#if defined(__cplusplus) +extern "C" { +#endif // defined(__cplusplus) +__DEVICE__ __attribute__((overloadable)) double _Cosh(double x, double y) { + return cosh(x) * y; +} +__DEVICE__ __attribute__((overloadable)) float _FCosh(float x, float y) { + return coshf(x) * y; +} +__DEVICE__ __attribute__((overloadable)) short _Dtest(double *p) { + return fpclassify(*p); +} +__DEVICE__ __attribute__((overloadable)) short _FDtest(float *p) { + return fpclassify(*p); +} +__DEVICE__ __attribute__((overloadable)) double _Sinh(double x, double y) { + return sinh(x) * y; +} +__DEVICE__ __attribute__((overloadable)) float _FSinh(float x, float y) { + return sinhf(x) * y; +} +#if defined(__cplusplus) +} +#endif // defined(__cplusplus) +#endif // defined(_MSC_VER) + +#pragma pop_macro("__DEVICE__") + +#endif // __CLANG_HIP_CMATH_H__ diff --git a/lib/include/__clang_hip_libdevice_declares.h b/lib/include/__clang_hip_libdevice_declares.h index e1cd49a39c..ac98907ad5 100644 --- a/lib/include/__clang_hip_libdevice_declares.h +++ b/lib/include/__clang_hip_libdevice_declares.h @@ -10,7 +10,9 @@ #ifndef __CLANG_HIP_LIBDEVICE_DECLARES_H__ #define __CLANG_HIP_LIBDEVICE_DECLARES_H__ +#ifdef __cplusplus extern "C" { +#endif // BEGIN FLOAT __device__ __attribute__((const)) float __ocml_acos_f32(float); @@ -78,6 +80,7 @@ __device__ __attribute__((const)) float __ocml_len4_f32(float, float, float, __device__ __attribute__((pure)) float __ocml_ncdf_f32(float); __device__ __attribute__((pure)) float __ocml_ncdfinv_f32(float); __device__ __attribute__((pure)) float __ocml_pow_f32(float, float); +__device__ __attribute__((pure)) float __ocml_pown_f32(float, int); __device__ __attribute__((pure)) float __ocml_rcbrt_f32(float); __device__ __attribute__((const)) float __ocml_remainder_f32(float, float); __device__ float __ocml_remquo_f32(float, float, @@ -126,10 +129,10 @@ __device__ __attribute__((const)) float __ocml_div_rte_f32(float, float); __device__ __attribute__((const)) float __ocml_div_rtn_f32(float, float); __device__ __attribute__((const)) float __ocml_div_rtp_f32(float, float); __device__ __attribute__((const)) float __ocml_div_rtz_f32(float, float); -__device__ __attribute__((const)) float __ocml_sqrt_rte_f32(float, float); -__device__ __attribute__((const)) float __ocml_sqrt_rtn_f32(float, float); -__device__ __attribute__((const)) float __ocml_sqrt_rtp_f32(float, float); -__device__ __attribute__((const)) float __ocml_sqrt_rtz_f32(float, float); +__device__ __attribute__((const)) float __ocml_sqrt_rte_f32(float); +__device__ __attribute__((const)) float __ocml_sqrt_rtn_f32(float); +__device__ __attribute__((const)) float __ocml_sqrt_rtp_f32(float); +__device__ __attribute__((const)) float __ocml_sqrt_rtz_f32(float); __device__ __attribute__((const)) float __ocml_fma_rte_f32(float, float, float); __device__ __attribute__((const)) float __ocml_fma_rtn_f32(float, float, float); __device__ __attribute__((const)) float __ocml_fma_rtp_f32(float, float, float); @@ -205,6 +208,7 @@ __device__ __attribute__((const)) double __ocml_len4_f64(double, double, double, __device__ __attribute__((pure)) double __ocml_ncdf_f64(double); __device__ __attribute__((pure)) double __ocml_ncdfinv_f64(double); __device__ __attribute__((pure)) double __ocml_pow_f64(double, double); +__device__ __attribute__((pure)) double __ocml_pown_f64(double, int); __device__ __attribute__((pure)) double __ocml_rcbrt_f64(double); __device__ __attribute__((const)) double __ocml_remainder_f64(double, double); __device__ double 
__ocml_remquo_f64(double, double, @@ -252,10 +256,10 @@ __device__ __attribute__((const)) double __ocml_div_rte_f64(double, double); __device__ __attribute__((const)) double __ocml_div_rtn_f64(double, double); __device__ __attribute__((const)) double __ocml_div_rtp_f64(double, double); __device__ __attribute__((const)) double __ocml_div_rtz_f64(double, double); -__device__ __attribute__((const)) double __ocml_sqrt_rte_f64(double, double); -__device__ __attribute__((const)) double __ocml_sqrt_rtn_f64(double, double); -__device__ __attribute__((const)) double __ocml_sqrt_rtp_f64(double, double); -__device__ __attribute__((const)) double __ocml_sqrt_rtz_f64(double, double); +__device__ __attribute__((const)) double __ocml_sqrt_rte_f64(double); +__device__ __attribute__((const)) double __ocml_sqrt_rtn_f64(double); +__device__ __attribute__((const)) double __ocml_sqrt_rtp_f64(double); +__device__ __attribute__((const)) double __ocml_sqrt_rtz_f64(double); __device__ __attribute__((const)) double __ocml_fma_rte_f64(double, double, double); __device__ __attribute__((const)) double __ocml_fma_rtn_f64(double, double, @@ -290,6 +294,7 @@ __device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16); __device__ _Float16 __ocml_sin_f16(_Float16); __device__ __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16); __device__ __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16); +__device__ __attribute__((pure)) _Float16 __ocml_pown_f16(_Float16, int); typedef _Float16 __2f16 __attribute__((ext_vector_type(2))); typedef short __2i16 __attribute__((ext_vector_type(2))); @@ -313,14 +318,17 @@ __device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16); __device__ inline __2f16 __llvm_amdgcn_rcp_2f16(__2f16 __x) // Not currently exposed by ROCDL. { - return __2f16{__llvm_amdgcn_rcp_f16(__x.x), __llvm_amdgcn_rcp_f16(__x.y)}; + return (__2f16)(__llvm_amdgcn_rcp_f16(__x.x), __llvm_amdgcn_rcp_f16(__x.y)); } __device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16); __device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16); __device__ __2f16 __ocml_sin_2f16(__2f16); __device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16); __device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16); +__device__ __attribute__((const)) __2f16 __ocml_pown_2f16(__2f16, __2i16); +#ifdef __cplusplus } // extern "C" +#endif #endif // __CLANG_HIP_LIBDEVICE_DECLARES_H__ diff --git a/lib/include/__clang_hip_math.h b/lib/include/__clang_hip_math.h index cf7014b9ae..14d91c66b3 100644 --- a/lib/include/__clang_hip_math.h +++ b/lib/include/__clang_hip_math.h @@ -1,4 +1,4 @@ -/*===---- __clang_hip_math.h - HIP math decls -------------------------------=== +/*===---- __clang_hip_math.h - Device-side HIP math support ----------------=== * * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. @@ -6,24 +6,57 @@ * *===-----------------------------------------------------------------------=== */ - #ifndef __CLANG_HIP_MATH_H__ #define __CLANG_HIP_MATH_H__ +#if !defined(__HIP__) +#error "This file is for HIP and OpenMP AMDGCN device compilation only." 
+#endif + +#if defined(__cplusplus) #include +#endif #include -#include #include #pragma push_macro("__DEVICE__") -#pragma push_macro("__RETURN_TYPE") +#define __DEVICE__ static __device__ inline __attribute__((always_inline)) -// to be consistent with __clang_cuda_math_forward_declares -#define __DEVICE__ static __device__ +// A few functions return bool type starting only in C++11. +#pragma push_macro("__RETURN_TYPE") +#if defined(__cplusplus) #define __RETURN_TYPE bool +#else +#define __RETURN_TYPE int +#endif + +#if defined (__cplusplus) && __cplusplus < 201103L +// emulate static_assert on type sizes +template +struct __compare_result{}; +template<> +struct __compare_result { + static const bool valid; +}; __DEVICE__ -inline uint64_t __make_mantissa_base8(const char *__tagp) { +void __suppress_unused_warning(bool b){}; +template +__DEVICE__ void __static_assert_equal_size() { + __suppress_unused_warning(__compare_result::valid); +} + +#define __static_assert_type_size_equal(A, B) \ + __static_assert_equal_size() + +#else +#define __static_assert_type_size_equal(A,B) \ + static_assert((A) == (B), "") + +#endif + +__DEVICE__ +uint64_t __make_mantissa_base8(const char *__tagp) { uint64_t __r = 0; while (__tagp) { char __tmp = *__tagp; @@ -40,7 +73,7 @@ inline uint64_t __make_mantissa_base8(const char *__tagp) { } __DEVICE__ -inline uint64_t __make_mantissa_base10(const char *__tagp) { +uint64_t __make_mantissa_base10(const char *__tagp) { uint64_t __r = 0; while (__tagp) { char __tmp = *__tagp; @@ -57,7 +90,7 @@ inline uint64_t __make_mantissa_base10(const char *__tagp) { } __DEVICE__ -inline uint64_t __make_mantissa_base16(const char *__tagp) { +uint64_t __make_mantissa_base16(const char *__tagp) { uint64_t __r = 0; while (__tagp) { char __tmp = *__tagp; @@ -78,7 +111,7 @@ inline uint64_t __make_mantissa_base16(const char *__tagp) { } __DEVICE__ -inline uint64_t __make_mantissa(const char *__tagp) { +uint64_t __make_mantissa(const char *__tagp) { if (!__tagp) return 0u; @@ -95,78 +128,124 @@ inline uint64_t __make_mantissa(const char *__tagp) { } // BEGIN FLOAT +#if defined(__cplusplus) __DEVICE__ -inline float abs(float __x) { return __ocml_fabs_f32(__x); } -__DEVICE__ -inline float acosf(float __x) { return __ocml_acos_f32(__x); } -__DEVICE__ -inline float acoshf(float __x) { return __ocml_acosh_f32(__x); } -__DEVICE__ -inline float asinf(float __x) { return __ocml_asin_f32(__x); } -__DEVICE__ -inline float asinhf(float __x) { return __ocml_asinh_f32(__x); } -__DEVICE__ -inline float atan2f(float __x, float __y) { return __ocml_atan2_f32(__x, __y); } -__DEVICE__ -inline float atanf(float __x) { return __ocml_atan_f32(__x); } -__DEVICE__ -inline float atanhf(float __x) { return __ocml_atanh_f32(__x); } -__DEVICE__ -inline float cbrtf(float __x) { return __ocml_cbrt_f32(__x); } -__DEVICE__ -inline float ceilf(float __x) { return __ocml_ceil_f32(__x); } -__DEVICE__ -inline float copysignf(float __x, float __y) { - return __ocml_copysign_f32(__x, __y); +int abs(int __x) { + int __sgn = __x >> (sizeof(int) * CHAR_BIT - 1); + return (__x ^ __sgn) - __sgn; } __DEVICE__ -inline float cosf(float __x) { return __ocml_cos_f32(__x); } +long labs(long __x) { + long __sgn = __x >> (sizeof(long) * CHAR_BIT - 1); + return (__x ^ __sgn) - __sgn; +} __DEVICE__ -inline float coshf(float __x) { return __ocml_cosh_f32(__x); } +long long llabs(long long __x) { + long long __sgn = __x >> (sizeof(long long) * CHAR_BIT - 1); + return (__x ^ __sgn) - __sgn; +} +#endif + __DEVICE__ -inline float cospif(float 
__x) { return __ocml_cospi_f32(__x); } +float acosf(float __x) { return __ocml_acos_f32(__x); } + __DEVICE__ -inline float cyl_bessel_i0f(float __x) { return __ocml_i0_f32(__x); } +float acoshf(float __x) { return __ocml_acosh_f32(__x); } + __DEVICE__ -inline float cyl_bessel_i1f(float __x) { return __ocml_i1_f32(__x); } +float asinf(float __x) { return __ocml_asin_f32(__x); } + __DEVICE__ -inline float erfcf(float __x) { return __ocml_erfc_f32(__x); } +float asinhf(float __x) { return __ocml_asinh_f32(__x); } + __DEVICE__ -inline float erfcinvf(float __x) { return __ocml_erfcinv_f32(__x); } +float atan2f(float __x, float __y) { return __ocml_atan2_f32(__x, __y); } + __DEVICE__ -inline float erfcxf(float __x) { return __ocml_erfcx_f32(__x); } +float atanf(float __x) { return __ocml_atan_f32(__x); } + __DEVICE__ -inline float erff(float __x) { return __ocml_erf_f32(__x); } +float atanhf(float __x) { return __ocml_atanh_f32(__x); } + __DEVICE__ -inline float erfinvf(float __x) { return __ocml_erfinv_f32(__x); } +float cbrtf(float __x) { return __ocml_cbrt_f32(__x); } + __DEVICE__ -inline float exp10f(float __x) { return __ocml_exp10_f32(__x); } +float ceilf(float __x) { return __ocml_ceil_f32(__x); } + __DEVICE__ -inline float exp2f(float __x) { return __ocml_exp2_f32(__x); } +float copysignf(float __x, float __y) { return __ocml_copysign_f32(__x, __y); } + __DEVICE__ -inline float expf(float __x) { return __ocml_exp_f32(__x); } +float cosf(float __x) { return __ocml_cos_f32(__x); } + __DEVICE__ -inline float expm1f(float __x) { return __ocml_expm1_f32(__x); } +float coshf(float __x) { return __ocml_cosh_f32(__x); } + __DEVICE__ -inline float fabsf(float __x) { return __ocml_fabs_f32(__x); } +float cospif(float __x) { return __ocml_cospi_f32(__x); } + __DEVICE__ -inline float fdimf(float __x, float __y) { return __ocml_fdim_f32(__x, __y); } +float cyl_bessel_i0f(float __x) { return __ocml_i0_f32(__x); } + __DEVICE__ -inline float fdividef(float __x, float __y) { return __x / __y; } +float cyl_bessel_i1f(float __x) { return __ocml_i1_f32(__x); } + __DEVICE__ -inline float floorf(float __x) { return __ocml_floor_f32(__x); } +float erfcf(float __x) { return __ocml_erfc_f32(__x); } + __DEVICE__ -inline float fmaf(float __x, float __y, float __z) { +float erfcinvf(float __x) { return __ocml_erfcinv_f32(__x); } + +__DEVICE__ +float erfcxf(float __x) { return __ocml_erfcx_f32(__x); } + +__DEVICE__ +float erff(float __x) { return __ocml_erf_f32(__x); } + +__DEVICE__ +float erfinvf(float __x) { return __ocml_erfinv_f32(__x); } + +__DEVICE__ +float exp10f(float __x) { return __ocml_exp10_f32(__x); } + +__DEVICE__ +float exp2f(float __x) { return __ocml_exp2_f32(__x); } + +__DEVICE__ +float expf(float __x) { return __ocml_exp_f32(__x); } + +__DEVICE__ +float expm1f(float __x) { return __ocml_expm1_f32(__x); } + +__DEVICE__ +float fabsf(float __x) { return __ocml_fabs_f32(__x); } + +__DEVICE__ +float fdimf(float __x, float __y) { return __ocml_fdim_f32(__x, __y); } + +__DEVICE__ +float fdividef(float __x, float __y) { return __x / __y; } + +__DEVICE__ +float floorf(float __x) { return __ocml_floor_f32(__x); } + +__DEVICE__ +float fmaf(float __x, float __y, float __z) { return __ocml_fma_f32(__x, __y, __z); } + __DEVICE__ -inline float fmaxf(float __x, float __y) { return __ocml_fmax_f32(__x, __y); } +float fmaxf(float __x, float __y) { return __ocml_fmax_f32(__x, __y); } + __DEVICE__ -inline float fminf(float __x, float __y) { return __ocml_fmin_f32(__x, __y); } +float fminf(float __x, float __y) { 
return __ocml_fmin_f32(__x, __y); } + __DEVICE__ -inline float fmodf(float __x, float __y) { return __ocml_fmod_f32(__x, __y); } +float fmodf(float __x, float __y) { return __ocml_fmod_f32(__x, __y); } + __DEVICE__ -inline float frexpf(float __x, int *__nptr) { +float frexpf(float __x, int *__nptr) { int __tmp; float __r = __ocml_frexp_f32(__x, (__attribute__((address_space(5))) int *)&__tmp); @@ -174,24 +253,31 @@ inline float frexpf(float __x, int *__nptr) { return __r; } + __DEVICE__ -inline float hypotf(float __x, float __y) { return __ocml_hypot_f32(__x, __y); } +float hypotf(float __x, float __y) { return __ocml_hypot_f32(__x, __y); } + __DEVICE__ -inline int ilogbf(float __x) { return __ocml_ilogb_f32(__x); } +int ilogbf(float __x) { return __ocml_ilogb_f32(__x); } + __DEVICE__ -inline __RETURN_TYPE isfinite(float __x) { return __ocml_isfinite_f32(__x); } +__RETURN_TYPE __finitef(float __x) { return __ocml_isfinite_f32(__x); } + __DEVICE__ -inline __RETURN_TYPE isinf(float __x) { return __ocml_isinf_f32(__x); } +__RETURN_TYPE __isinff(float __x) { return __ocml_isinf_f32(__x); } + __DEVICE__ -inline __RETURN_TYPE isnan(float __x) { return __ocml_isnan_f32(__x); } +__RETURN_TYPE __isnanf(float __x) { return __ocml_isnan_f32(__x); } + __DEVICE__ -inline float j0f(float __x) { return __ocml_j0_f32(__x); } +float j0f(float __x) { return __ocml_j0_f32(__x); } + __DEVICE__ -inline float j1f(float __x) { return __ocml_j1_f32(__x); } +float j1f(float __x) { return __ocml_j1_f32(__x); } + __DEVICE__ -inline float jnf(int __n, - float __x) { // TODO: we could use Ahmes multiplication - // and the Miller & Brown algorithm +float jnf(int __n, float __x) { // TODO: we could use Ahmes multiplication + // and the Miller & Brown algorithm // for linear recurrences to get O(log n) steps, but it's unclear if // it'd be beneficial in this case. 
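// The hunk below evaluates J_n(x) with the standard three-term forward
// recurrence, seeded from j0f()/j1f():
//   J_{n+1}(x) = (2*n/x) * J_n(x) - J_{n-1}(x)
// Each loop iteration computes __x2 = (2 * __i) / __x * __x1 - __x0 and
// shifts the (__x0, __x1) window forward, so __x1 holds J_n on exit.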
if (__n == 0) @@ -209,50 +295,61 @@ inline float jnf(int __n, return __x1; } + __DEVICE__ -inline float ldexpf(float __x, int __e) { return __ocml_ldexp_f32(__x, __e); } +float ldexpf(float __x, int __e) { return __ocml_ldexp_f32(__x, __e); } + __DEVICE__ -inline float lgammaf(float __x) { return __ocml_lgamma_f32(__x); } +float lgammaf(float __x) { return __ocml_lgamma_f32(__x); } + __DEVICE__ -inline long long int llrintf(float __x) { return __ocml_rint_f32(__x); } +long long int llrintf(float __x) { return __ocml_rint_f32(__x); } + __DEVICE__ -inline long long int llroundf(float __x) { return __ocml_round_f32(__x); } +long long int llroundf(float __x) { return __ocml_round_f32(__x); } + __DEVICE__ -inline float log10f(float __x) { return __ocml_log10_f32(__x); } +float log10f(float __x) { return __ocml_log10_f32(__x); } + __DEVICE__ -inline float log1pf(float __x) { return __ocml_log1p_f32(__x); } +float log1pf(float __x) { return __ocml_log1p_f32(__x); } + __DEVICE__ -inline float log2f(float __x) { return __ocml_log2_f32(__x); } +float log2f(float __x) { return __ocml_log2_f32(__x); } + __DEVICE__ -inline float logbf(float __x) { return __ocml_logb_f32(__x); } +float logbf(float __x) { return __ocml_logb_f32(__x); } + __DEVICE__ -inline float logf(float __x) { return __ocml_log_f32(__x); } +float logf(float __x) { return __ocml_log_f32(__x); } + __DEVICE__ -inline long int lrintf(float __x) { return __ocml_rint_f32(__x); } +long int lrintf(float __x) { return __ocml_rint_f32(__x); } + __DEVICE__ -inline long int lroundf(float __x) { return __ocml_round_f32(__x); } +long int lroundf(float __x) { return __ocml_round_f32(__x); } + __DEVICE__ -inline float modff(float __x, float *__iptr) { +float modff(float __x, float *__iptr) { float __tmp; float __r = __ocml_modf_f32(__x, (__attribute__((address_space(5))) float *)&__tmp); *__iptr = __tmp; - return __r; } + __DEVICE__ -inline float nanf(const char *__tagp) { +float nanf(const char *__tagp) { union { float val; struct ieee_float { - uint32_t mantissa : 22; - uint32_t quiet : 1; - uint32_t exponent : 8; - uint32_t sign : 1; + unsigned int mantissa : 22; + unsigned int quiet : 1; + unsigned int exponent : 8; + unsigned int sign : 1; } bits; - - static_assert(sizeof(float) == sizeof(ieee_float), ""); } __tmp; + __static_assert_type_size_equal(sizeof(__tmp.val), sizeof(__tmp.bits)); __tmp.bits.sign = 0u; __tmp.bits.exponent = ~0u; @@ -261,28 +358,34 @@ inline float nanf(const char *__tagp) { return __tmp.val; } + __DEVICE__ -inline float nearbyintf(float __x) { return __ocml_nearbyint_f32(__x); } +float nearbyintf(float __x) { return __ocml_nearbyint_f32(__x); } + __DEVICE__ -inline float nextafterf(float __x, float __y) { +float nextafterf(float __x, float __y) { return __ocml_nextafter_f32(__x, __y); } + __DEVICE__ -inline float norm3df(float __x, float __y, float __z) { +float norm3df(float __x, float __y, float __z) { return __ocml_len3_f32(__x, __y, __z); } + __DEVICE__ -inline float norm4df(float __x, float __y, float __z, float __w) { +float norm4df(float __x, float __y, float __z, float __w) { return __ocml_len4_f32(__x, __y, __z, __w); } + __DEVICE__ -inline float normcdff(float __x) { return __ocml_ncdf_f32(__x); } +float normcdff(float __x) { return __ocml_ncdf_f32(__x); } + __DEVICE__ -inline float normcdfinvf(float __x) { return __ocml_ncdfinv_f32(__x); } +float normcdfinvf(float __x) { return __ocml_ncdfinv_f32(__x); } + __DEVICE__ -inline float -normf(int __dim, - const float *__a) { // TODO: placeholder until OCML adds 
support. +float normf(int __dim, + const float *__a) { // TODO: placeholder until OCML adds support. float __r = 0; while (__dim--) { __r += __a[0] * __a[0]; @@ -291,16 +394,23 @@ normf(int __dim, return __ocml_sqrt_f32(__r); } + __DEVICE__ -inline float powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); } +float powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); } + __DEVICE__ -inline float rcbrtf(float __x) { return __ocml_rcbrt_f32(__x); } +float powif(float __x, int __y) { return __ocml_pown_f32(__x, __y); } + __DEVICE__ -inline float remainderf(float __x, float __y) { +float rcbrtf(float __x) { return __ocml_rcbrt_f32(__x); } + +__DEVICE__ +float remainderf(float __x, float __y) { return __ocml_remainder_f32(__x, __y); } + __DEVICE__ -inline float remquof(float __x, float __y, int *__quo) { +float remquof(float __x, float __y, int *__quo) { int __tmp; float __r = __ocml_remquo_f32( __x, __y, (__attribute__((address_space(5))) int *)&__tmp); @@ -308,25 +418,26 @@ inline float remquof(float __x, float __y, int *__quo) { return __r; } + __DEVICE__ -inline float rhypotf(float __x, float __y) { - return __ocml_rhypot_f32(__x, __y); -} +float rhypotf(float __x, float __y) { return __ocml_rhypot_f32(__x, __y); } + __DEVICE__ -inline float rintf(float __x) { return __ocml_rint_f32(__x); } +float rintf(float __x) { return __ocml_rint_f32(__x); } + __DEVICE__ -inline float rnorm3df(float __x, float __y, float __z) { +float rnorm3df(float __x, float __y, float __z) { return __ocml_rlen3_f32(__x, __y, __z); } __DEVICE__ -inline float rnorm4df(float __x, float __y, float __z, float __w) { +float rnorm4df(float __x, float __y, float __z, float __w) { return __ocml_rlen4_f32(__x, __y, __z, __w); } + __DEVICE__ -inline float -rnormf(int __dim, - const float *__a) { // TODO: placeholder until OCML adds support. +float rnormf(int __dim, + const float *__a) { // TODO: placeholder until OCML adds support. float __r = 0; while (__dim--) { __r += __a[0] * __a[0]; @@ -335,59 +446,74 @@ rnormf(int __dim, return __ocml_rsqrt_f32(__r); } + __DEVICE__ -inline float roundf(float __x) { return __ocml_round_f32(__x); } +float roundf(float __x) { return __ocml_round_f32(__x); } + __DEVICE__ -inline float rsqrtf(float __x) { return __ocml_rsqrt_f32(__x); } +float rsqrtf(float __x) { return __ocml_rsqrt_f32(__x); } + __DEVICE__ -inline float scalblnf(float __x, long int __n) { +float scalblnf(float __x, long int __n) { return (__n < INT_MAX) ? 
__ocml_scalbn_f32(__x, __n) : __ocml_scalb_f32(__x, __n); } -__DEVICE__ -inline float scalbnf(float __x, int __n) { return __ocml_scalbn_f32(__x, __n); } -__DEVICE__ -inline __RETURN_TYPE signbit(float __x) { return __ocml_signbit_f32(__x); } -__DEVICE__ -inline void sincosf(float __x, float *__sinptr, float *__cosptr) { - float __tmp; +__DEVICE__ +float scalbnf(float __x, int __n) { return __ocml_scalbn_f32(__x, __n); } + +__DEVICE__ +__RETURN_TYPE __signbitf(float __x) { return __ocml_signbit_f32(__x); } + +__DEVICE__ +void sincosf(float __x, float *__sinptr, float *__cosptr) { + float __tmp; *__sinptr = __ocml_sincos_f32(__x, (__attribute__((address_space(5))) float *)&__tmp); *__cosptr = __tmp; } -__DEVICE__ -inline void sincospif(float __x, float *__sinptr, float *__cosptr) { - float __tmp; +__DEVICE__ +void sincospif(float __x, float *__sinptr, float *__cosptr) { + float __tmp; *__sinptr = __ocml_sincospi_f32( __x, (__attribute__((address_space(5))) float *)&__tmp); *__cosptr = __tmp; } + __DEVICE__ -inline float sinf(float __x) { return __ocml_sin_f32(__x); } +float sinf(float __x) { return __ocml_sin_f32(__x); } + __DEVICE__ -inline float sinhf(float __x) { return __ocml_sinh_f32(__x); } +float sinhf(float __x) { return __ocml_sinh_f32(__x); } + __DEVICE__ -inline float sinpif(float __x) { return __ocml_sinpi_f32(__x); } +float sinpif(float __x) { return __ocml_sinpi_f32(__x); } + __DEVICE__ -inline float sqrtf(float __x) { return __ocml_sqrt_f32(__x); } +float sqrtf(float __x) { return __ocml_sqrt_f32(__x); } + __DEVICE__ -inline float tanf(float __x) { return __ocml_tan_f32(__x); } +float tanf(float __x) { return __ocml_tan_f32(__x); } + __DEVICE__ -inline float tanhf(float __x) { return __ocml_tanh_f32(__x); } +float tanhf(float __x) { return __ocml_tanh_f32(__x); } + __DEVICE__ -inline float tgammaf(float __x) { return __ocml_tgamma_f32(__x); } +float tgammaf(float __x) { return __ocml_tgamma_f32(__x); } + __DEVICE__ -inline float truncf(float __x) { return __ocml_trunc_f32(__x); } +float truncf(float __x) { return __ocml_trunc_f32(__x); } + __DEVICE__ -inline float y0f(float __x) { return __ocml_y0_f32(__x); } +float y0f(float __x) { return __ocml_y0_f32(__x); } + __DEVICE__ -inline float y1f(float __x) { return __ocml_y1_f32(__x); } +float y1f(float __x) { return __ocml_y1_f32(__x); } + __DEVICE__ -inline float ynf(int __n, - float __x) { // TODO: we could use Ahmes multiplication - // and the Miller & Brown algorithm +float ynf(int __n, float __x) { // TODO: we could use Ahmes multiplication + // and the Miller & Brown algorithm // for linear recurrences to get O(log n) steps, but it's unclear if // it'd be beneficial in this case. Placeholder until OCML adds // support. 
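// The intrinsics hunks that follow are reorganized so that each rounding
// family (_rd/_rn/_ru/_rz) lives under a single OCML_BASIC_ROUNDED_OPERATIONS
// guard, with only the round-to-nearest variant kept as a plain-arithmetic
// fallback when the correctly-rounded OCML entry points are unavailable.
// A minimal sketch of that shape, using hypothetical MY_* and my_add_* names
// in place of the __ocml_* builtins:

#if defined MY_BASIC_ROUNDED_OPERATIONS
static inline float my_fadd_rd(float x, float y) { return my_add_rtn(x, y); }
static inline float my_fadd_rn(float x, float y) { return my_add_rte(x, y); }
static inline float my_fadd_ru(float x, float y) { return my_add_rtp(x, y); }
static inline float my_fadd_rz(float x, float y) { return my_add_rtz(x, y); }
#else
// Without correctly-rounded primitives, only round-to-nearest is defined,
// in terms of the ordinary + operator.
static inline float my_fadd_rn(float x, float y) { return x + y; }
#endif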
@@ -408,290 +534,343 @@ inline float ynf(int __n, } // BEGIN INTRINSICS + __DEVICE__ -inline float __cosf(float __x) { return __ocml_native_cos_f32(__x); } +float __cosf(float __x) { return __ocml_native_cos_f32(__x); } + __DEVICE__ -inline float __exp10f(float __x) { return __ocml_native_exp10_f32(__x); } +float __exp10f(float __x) { return __ocml_native_exp10_f32(__x); } + __DEVICE__ -inline float __expf(float __x) { return __ocml_native_exp_f32(__x); } +float __expf(float __x) { return __ocml_native_exp_f32(__x); } + #if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline float __fadd_rd(float __x, float __y) { - return __ocml_add_rtn_f32(__x, __y); -} +float __fadd_rd(float __x, float __y) { return __ocml_add_rtn_f32(__x, __y); } +__DEVICE__ +float __fadd_rn(float __x, float __y) { return __ocml_add_rte_f32(__x, __y); } +__DEVICE__ +float __fadd_ru(float __x, float __y) { return __ocml_add_rtp_f32(__x, __y); } +__DEVICE__ +float __fadd_rz(float __x, float __y) { return __ocml_add_rtz_f32(__x, __y); } +#else +__DEVICE__ +float __fadd_rn(float __x, float __y) { return __x + __y; } #endif -__DEVICE__ -inline float __fadd_rn(float __x, float __y) { return __x + __y; } + #if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline float __fadd_ru(float __x, float __y) { - return __ocml_add_rtp_f32(__x, __y); -} +float __fdiv_rd(float __x, float __y) { return __ocml_div_rtn_f32(__x, __y); } __DEVICE__ -inline float __fadd_rz(float __x, float __y) { - return __ocml_add_rtz_f32(__x, __y); -} +float __fdiv_rn(float __x, float __y) { return __ocml_div_rte_f32(__x, __y); } __DEVICE__ -inline float __fdiv_rd(float __x, float __y) { - return __ocml_div_rtn_f32(__x, __y); -} +float __fdiv_ru(float __x, float __y) { return __ocml_div_rtp_f32(__x, __y); } +__DEVICE__ +float __fdiv_rz(float __x, float __y) { return __ocml_div_rtz_f32(__x, __y); } +#else +__DEVICE__ +float __fdiv_rn(float __x, float __y) { return __x / __y; } #endif + __DEVICE__ -inline float __fdiv_rn(float __x, float __y) { return __x / __y; } +float __fdividef(float __x, float __y) { return __x / __y; } + #if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline float __fdiv_ru(float __x, float __y) { - return __ocml_div_rtp_f32(__x, __y); -} -__DEVICE__ -inline float __fdiv_rz(float __x, float __y) { - return __ocml_div_rtz_f32(__x, __y); -} -#endif -__DEVICE__ -inline float __fdividef(float __x, float __y) { return __x / __y; } -#if defined OCML_BASIC_ROUNDED_OPERATIONS -__DEVICE__ -inline float __fmaf_rd(float __x, float __y, float __z) { +float __fmaf_rd(float __x, float __y, float __z) { return __ocml_fma_rtn_f32(__x, __y, __z); } -#endif __DEVICE__ -inline float __fmaf_rn(float __x, float __y, float __z) { - return __ocml_fma_f32(__x, __y, __z); +float __fmaf_rn(float __x, float __y, float __z) { + return __ocml_fma_rte_f32(__x, __y, __z); } -#if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline float __fmaf_ru(float __x, float __y, float __z) { +float __fmaf_ru(float __x, float __y, float __z) { return __ocml_fma_rtp_f32(__x, __y, __z); } __DEVICE__ -inline float __fmaf_rz(float __x, float __y, float __z) { +float __fmaf_rz(float __x, float __y, float __z) { return __ocml_fma_rtz_f32(__x, __y, __z); } +#else __DEVICE__ -inline float __fmul_rd(float __x, float __y) { - return __ocml_mul_rtn_f32(__x, __y); +float __fmaf_rn(float __x, float __y, float __z) { + return __ocml_fma_f32(__x, __y, __z); } #endif -__DEVICE__ -inline float __fmul_rn(float __x, float __y) { return __x * __y; } + #if defined 
OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline float __fmul_ru(float __x, float __y) { - return __ocml_mul_rtp_f32(__x, __y); -} +float __fmul_rd(float __x, float __y) { return __ocml_mul_rtn_f32(__x, __y); } __DEVICE__ -inline float __fmul_rz(float __x, float __y) { - return __ocml_mul_rtz_f32(__x, __y); -} +float __fmul_rn(float __x, float __y) { return __ocml_mul_rte_f32(__x, __y); } __DEVICE__ -inline float __frcp_rd(float __x) { return __llvm_amdgcn_rcp_f32(__x); } +float __fmul_ru(float __x, float __y) { return __ocml_mul_rtp_f32(__x, __y); } +__DEVICE__ +float __fmul_rz(float __x, float __y) { return __ocml_mul_rtz_f32(__x, __y); } +#else +__DEVICE__ +float __fmul_rn(float __x, float __y) { return __x * __y; } #endif -__DEVICE__ -inline float __frcp_rn(float __x) { return __llvm_amdgcn_rcp_f32(__x); } + #if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline float __frcp_ru(float __x) { return __llvm_amdgcn_rcp_f32(__x); } +float __frcp_rd(float __x) { return __ocml_div_rtn_f32(1.0f, __x); } __DEVICE__ -inline float __frcp_rz(float __x) { return __llvm_amdgcn_rcp_f32(__x); } +float __frcp_rn(float __x) { return __ocml_div_rte_f32(1.0f, __x); } +__DEVICE__ +float __frcp_ru(float __x) { return __ocml_div_rtp_f32(1.0f, __x); } +__DEVICE__ +float __frcp_rz(float __x) { return __ocml_div_rtz_f32(1.0f, __x); } +#else +__DEVICE__ +float __frcp_rn(float __x) { return 1.0f / __x; } #endif + __DEVICE__ -inline float __frsqrt_rn(float __x) { return __llvm_amdgcn_rsq_f32(__x); } +float __frsqrt_rn(float __x) { return __llvm_amdgcn_rsq_f32(__x); } + #if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline float __fsqrt_rd(float __x) { return __ocml_sqrt_rtn_f32(__x); } -#endif +float __fsqrt_rd(float __x) { return __ocml_sqrt_rtn_f32(__x); } __DEVICE__ -inline float __fsqrt_rn(float __x) { return __ocml_native_sqrt_f32(__x); } +float __fsqrt_rn(float __x) { return __ocml_sqrt_rte_f32(__x); } +__DEVICE__ +float __fsqrt_ru(float __x) { return __ocml_sqrt_rtp_f32(__x); } +__DEVICE__ +float __fsqrt_rz(float __x) { return __ocml_sqrt_rtz_f32(__x); } +#else +__DEVICE__ +float __fsqrt_rn(float __x) { return __ocml_native_sqrt_f32(__x); } +#endif + #if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline float __fsqrt_ru(float __x) { return __ocml_sqrt_rtp_f32(__x); } +float __fsub_rd(float __x, float __y) { return __ocml_sub_rtn_f32(__x, __y); } __DEVICE__ -inline float __fsqrt_rz(float __x) { return __ocml_sqrt_rtz_f32(__x); } +float __fsub_rn(float __x, float __y) { return __ocml_sub_rte_f32(__x, __y); } __DEVICE__ -inline float __fsub_rd(float __x, float __y) { - return __ocml_sub_rtn_f32(__x, __y); -} +float __fsub_ru(float __x, float __y) { return __ocml_sub_rtp_f32(__x, __y); } +__DEVICE__ +float __fsub_rz(float __x, float __y) { return __ocml_sub_rtz_f32(__x, __y); } +#else +__DEVICE__ +float __fsub_rn(float __x, float __y) { return __x - __y; } #endif + __DEVICE__ -inline float __fsub_rn(float __x, float __y) { return __x - __y; } -#if defined OCML_BASIC_ROUNDED_OPERATIONS +float __log10f(float __x) { return __ocml_native_log10_f32(__x); } + __DEVICE__ -inline float __fsub_ru(float __x, float __y) { - return __ocml_sub_rtp_f32(__x, __y); -} +float __log2f(float __x) { return __ocml_native_log2_f32(__x); } + __DEVICE__ -inline float __fsub_rz(float __x, float __y) { - return __ocml_sub_rtz_f32(__x, __y); -} -#endif +float __logf(float __x) { return __ocml_native_log_f32(__x); } + __DEVICE__ -inline float __log10f(float __x) { return __ocml_native_log10_f32(__x); } +float 
__powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); } + __DEVICE__ -inline float __log2f(float __x) { return __ocml_native_log2_f32(__x); } +float __saturatef(float __x) { return (__x < 0) ? 0 : ((__x > 1) ? 1 : __x); } + __DEVICE__ -inline float __logf(float __x) { return __ocml_native_log_f32(__x); } -__DEVICE__ -inline float __powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); } -__DEVICE__ -inline float __saturatef(float __x) { - return (__x < 0) ? 0 : ((__x > 1) ? 1 : __x); -} -__DEVICE__ -inline void __sincosf(float __x, float *__sinptr, float *__cosptr) { +void __sincosf(float __x, float *__sinptr, float *__cosptr) { *__sinptr = __ocml_native_sin_f32(__x); *__cosptr = __ocml_native_cos_f32(__x); } + __DEVICE__ -inline float __sinf(float __x) { return __ocml_native_sin_f32(__x); } +float __sinf(float __x) { return __ocml_native_sin_f32(__x); } + __DEVICE__ -inline float __tanf(float __x) { return __ocml_tan_f32(__x); } +float __tanf(float __x) { return __ocml_tan_f32(__x); } // END INTRINSICS // END FLOAT // BEGIN DOUBLE __DEVICE__ -inline double abs(double __x) { return __ocml_fabs_f64(__x); } +double acos(double __x) { return __ocml_acos_f64(__x); } + __DEVICE__ -inline double acos(double __x) { return __ocml_acos_f64(__x); } +double acosh(double __x) { return __ocml_acosh_f64(__x); } + __DEVICE__ -inline double acosh(double __x) { return __ocml_acosh_f64(__x); } +double asin(double __x) { return __ocml_asin_f64(__x); } + __DEVICE__ -inline double asin(double __x) { return __ocml_asin_f64(__x); } +double asinh(double __x) { return __ocml_asinh_f64(__x); } + __DEVICE__ -inline double asinh(double __x) { return __ocml_asinh_f64(__x); } +double atan(double __x) { return __ocml_atan_f64(__x); } + __DEVICE__ -inline double atan(double __x) { return __ocml_atan_f64(__x); } +double atan2(double __x, double __y) { return __ocml_atan2_f64(__x, __y); } + __DEVICE__ -inline double atan2(double __x, double __y) { - return __ocml_atan2_f64(__x, __y); -} +double atanh(double __x) { return __ocml_atanh_f64(__x); } + __DEVICE__ -inline double atanh(double __x) { return __ocml_atanh_f64(__x); } +double cbrt(double __x) { return __ocml_cbrt_f64(__x); } + __DEVICE__ -inline double cbrt(double __x) { return __ocml_cbrt_f64(__x); } +double ceil(double __x) { return __ocml_ceil_f64(__x); } + __DEVICE__ -inline double ceil(double __x) { return __ocml_ceil_f64(__x); } -__DEVICE__ -inline double copysign(double __x, double __y) { +double copysign(double __x, double __y) { return __ocml_copysign_f64(__x, __y); } + __DEVICE__ -inline double cos(double __x) { return __ocml_cos_f64(__x); } +double cos(double __x) { return __ocml_cos_f64(__x); } + __DEVICE__ -inline double cosh(double __x) { return __ocml_cosh_f64(__x); } +double cosh(double __x) { return __ocml_cosh_f64(__x); } + __DEVICE__ -inline double cospi(double __x) { return __ocml_cospi_f64(__x); } +double cospi(double __x) { return __ocml_cospi_f64(__x); } + __DEVICE__ -inline double cyl_bessel_i0(double __x) { return __ocml_i0_f64(__x); } +double cyl_bessel_i0(double __x) { return __ocml_i0_f64(__x); } + __DEVICE__ -inline double cyl_bessel_i1(double __x) { return __ocml_i1_f64(__x); } +double cyl_bessel_i1(double __x) { return __ocml_i1_f64(__x); } + __DEVICE__ -inline double erf(double __x) { return __ocml_erf_f64(__x); } +double erf(double __x) { return __ocml_erf_f64(__x); } + __DEVICE__ -inline double erfc(double __x) { return __ocml_erfc_f64(__x); } +double erfc(double __x) { return __ocml_erfc_f64(__x); } + __DEVICE__ 
-inline double erfcinv(double __x) { return __ocml_erfcinv_f64(__x); } +double erfcinv(double __x) { return __ocml_erfcinv_f64(__x); } + __DEVICE__ -inline double erfcx(double __x) { return __ocml_erfcx_f64(__x); } +double erfcx(double __x) { return __ocml_erfcx_f64(__x); } + __DEVICE__ -inline double erfinv(double __x) { return __ocml_erfinv_f64(__x); } +double erfinv(double __x) { return __ocml_erfinv_f64(__x); } + __DEVICE__ -inline double exp(double __x) { return __ocml_exp_f64(__x); } +double exp(double __x) { return __ocml_exp_f64(__x); } + __DEVICE__ -inline double exp10(double __x) { return __ocml_exp10_f64(__x); } +double exp10(double __x) { return __ocml_exp10_f64(__x); } + __DEVICE__ -inline double exp2(double __x) { return __ocml_exp2_f64(__x); } +double exp2(double __x) { return __ocml_exp2_f64(__x); } + __DEVICE__ -inline double expm1(double __x) { return __ocml_expm1_f64(__x); } +double expm1(double __x) { return __ocml_expm1_f64(__x); } + __DEVICE__ -inline double fabs(double __x) { return __ocml_fabs_f64(__x); } +double fabs(double __x) { return __ocml_fabs_f64(__x); } + __DEVICE__ -inline double fdim(double __x, double __y) { return __ocml_fdim_f64(__x, __y); } +double fdim(double __x, double __y) { return __ocml_fdim_f64(__x, __y); } + __DEVICE__ -inline double floor(double __x) { return __ocml_floor_f64(__x); } +double floor(double __x) { return __ocml_floor_f64(__x); } + __DEVICE__ -inline double fma(double __x, double __y, double __z) { +double fma(double __x, double __y, double __z) { return __ocml_fma_f64(__x, __y, __z); } + __DEVICE__ -inline double fmax(double __x, double __y) { return __ocml_fmax_f64(__x, __y); } +double fmax(double __x, double __y) { return __ocml_fmax_f64(__x, __y); } + __DEVICE__ -inline double fmin(double __x, double __y) { return __ocml_fmin_f64(__x, __y); } +double fmin(double __x, double __y) { return __ocml_fmin_f64(__x, __y); } + __DEVICE__ -inline double fmod(double __x, double __y) { return __ocml_fmod_f64(__x, __y); } +double fmod(double __x, double __y) { return __ocml_fmod_f64(__x, __y); } + __DEVICE__ -inline double frexp(double __x, int *__nptr) { +double frexp(double __x, int *__nptr) { int __tmp; double __r = __ocml_frexp_f64(__x, (__attribute__((address_space(5))) int *)&__tmp); *__nptr = __tmp; - return __r; } + __DEVICE__ -inline double hypot(double __x, double __y) { - return __ocml_hypot_f64(__x, __y); -} +double hypot(double __x, double __y) { return __ocml_hypot_f64(__x, __y); } + __DEVICE__ -inline int ilogb(double __x) { return __ocml_ilogb_f64(__x); } +int ilogb(double __x) { return __ocml_ilogb_f64(__x); } + __DEVICE__ -inline __RETURN_TYPE isfinite(double __x) { return __ocml_isfinite_f64(__x); } +__RETURN_TYPE __finite(double __x) { return __ocml_isfinite_f64(__x); } + __DEVICE__ -inline __RETURN_TYPE isinf(double __x) { return __ocml_isinf_f64(__x); } +__RETURN_TYPE __isinf(double __x) { return __ocml_isinf_f64(__x); } + __DEVICE__ -inline __RETURN_TYPE isnan(double __x) { return __ocml_isnan_f64(__x); } +__RETURN_TYPE __isnan(double __x) { return __ocml_isnan_f64(__x); } + __DEVICE__ -inline double j0(double __x) { return __ocml_j0_f64(__x); } +double j0(double __x) { return __ocml_j0_f64(__x); } + __DEVICE__ -inline double j1(double __x) { return __ocml_j1_f64(__x); } +double j1(double __x) { return __ocml_j1_f64(__x); } + __DEVICE__ -inline double jn(int __n, - double __x) { // TODO: we could use Ahmes multiplication - // and the Miller & Brown algorithm +double jn(int __n, double __x) { // TODO: we could use 
Ahmes multiplication + // and the Miller & Brown algorithm // for linear recurrences to get O(log n) steps, but it's unclear if // it'd be beneficial in this case. Placeholder until OCML adds // support. if (__n == 0) - return j0f(__x); + return j0(__x); if (__n == 1) - return j1f(__x); + return j1(__x); - double __x0 = j0f(__x); - double __x1 = j1f(__x); + double __x0 = j0(__x); + double __x1 = j1(__x); for (int __i = 1; __i < __n; ++__i) { double __x2 = (2 * __i) / __x * __x1 - __x0; __x0 = __x1; __x1 = __x2; } - return __x1; } + __DEVICE__ -inline double ldexp(double __x, int __e) { return __ocml_ldexp_f64(__x, __e); } +double ldexp(double __x, int __e) { return __ocml_ldexp_f64(__x, __e); } + __DEVICE__ -inline double lgamma(double __x) { return __ocml_lgamma_f64(__x); } +double lgamma(double __x) { return __ocml_lgamma_f64(__x); } + __DEVICE__ -inline long long int llrint(double __x) { return __ocml_rint_f64(__x); } +long long int llrint(double __x) { return __ocml_rint_f64(__x); } + __DEVICE__ -inline long long int llround(double __x) { return __ocml_round_f64(__x); } +long long int llround(double __x) { return __ocml_round_f64(__x); } + __DEVICE__ -inline double log(double __x) { return __ocml_log_f64(__x); } +double log(double __x) { return __ocml_log_f64(__x); } + __DEVICE__ -inline double log10(double __x) { return __ocml_log10_f64(__x); } +double log10(double __x) { return __ocml_log10_f64(__x); } + __DEVICE__ -inline double log1p(double __x) { return __ocml_log1p_f64(__x); } +double log1p(double __x) { return __ocml_log1p_f64(__x); } + __DEVICE__ -inline double log2(double __x) { return __ocml_log2_f64(__x); } +double log2(double __x) { return __ocml_log2_f64(__x); } + __DEVICE__ -inline double logb(double __x) { return __ocml_logb_f64(__x); } +double logb(double __x) { return __ocml_logb_f64(__x); } + __DEVICE__ -inline long int lrint(double __x) { return __ocml_rint_f64(__x); } +long int lrint(double __x) { return __ocml_rint_f64(__x); } + __DEVICE__ -inline long int lround(double __x) { return __ocml_round_f64(__x); } +long int lround(double __x) { return __ocml_round_f64(__x); } + __DEVICE__ -inline double modf(double __x, double *__iptr) { +double modf(double __x, double *__iptr) { double __tmp; double __r = __ocml_modf_f64(__x, (__attribute__((address_space(5))) double *)&__tmp); @@ -699,8 +878,9 @@ inline double modf(double __x, double *__iptr) { return __r; } + __DEVICE__ -inline double nan(const char *__tagp) { +double nan(const char *__tagp) { #if !_WIN32 union { double val; @@ -710,8 +890,8 @@ inline double nan(const char *__tagp) { uint32_t exponent : 11; uint32_t sign : 1; } bits; - static_assert(sizeof(double) == sizeof(ieee_double), ""); } __tmp; + __static_assert_type_size_equal(sizeof(__tmp.val), sizeof(__tmp.bits)); __tmp.bits.sign = 0u; __tmp.bits.exponent = ~0u; @@ -720,22 +900,24 @@ inline double nan(const char *__tagp) { return __tmp.val; #else - static_assert(sizeof(uint64_t) == sizeof(double)); - uint64_t val = __make_mantissa(__tagp); - val |= 0xFFF << 51; - return *reinterpret_cast(&val); + __static_assert_type_size_equal(sizeof(uint64_t), sizeof(double)); + uint64_t __val = __make_mantissa(__tagp); + __val |= 0xFFF << 51; + return *reinterpret_cast(&__val); #endif } + __DEVICE__ -inline double nearbyint(double __x) { return __ocml_nearbyint_f64(__x); } +double nearbyint(double __x) { return __ocml_nearbyint_f64(__x); } + __DEVICE__ -inline double nextafter(double __x, double __y) { +double nextafter(double __x, double __y) { return 
__ocml_nextafter_f64(__x, __y); } + __DEVICE__ -inline double -norm(int __dim, - const double *__a) { // TODO: placeholder until OCML adds support. +double norm(int __dim, + const double *__a) { // TODO: placeholder until OCML adds support. double __r = 0; while (__dim--) { __r += __a[0] * __a[0]; @@ -744,28 +926,39 @@ norm(int __dim, return __ocml_sqrt_f64(__r); } + __DEVICE__ -inline double norm3d(double __x, double __y, double __z) { +double norm3d(double __x, double __y, double __z) { return __ocml_len3_f64(__x, __y, __z); } + __DEVICE__ -inline double norm4d(double __x, double __y, double __z, double __w) { +double norm4d(double __x, double __y, double __z, double __w) { return __ocml_len4_f64(__x, __y, __z, __w); } + __DEVICE__ -inline double normcdf(double __x) { return __ocml_ncdf_f64(__x); } +double normcdf(double __x) { return __ocml_ncdf_f64(__x); } + __DEVICE__ -inline double normcdfinv(double __x) { return __ocml_ncdfinv_f64(__x); } +double normcdfinv(double __x) { return __ocml_ncdfinv_f64(__x); } + __DEVICE__ -inline double pow(double __x, double __y) { return __ocml_pow_f64(__x, __y); } +double pow(double __x, double __y) { return __ocml_pow_f64(__x, __y); } + __DEVICE__ -inline double rcbrt(double __x) { return __ocml_rcbrt_f64(__x); } +double powi(double __x, int __y) { return __ocml_pown_f64(__x, __y); } + __DEVICE__ -inline double remainder(double __x, double __y) { +double rcbrt(double __x) { return __ocml_rcbrt_f64(__x); } + +__DEVICE__ +double remainder(double __x, double __y) { return __ocml_remainder_f64(__x, __y); } + __DEVICE__ -inline double remquo(double __x, double __y, int *__quo) { +double remquo(double __x, double __y, int *__quo) { int __tmp; double __r = __ocml_remquo_f64( __x, __y, (__attribute__((address_space(5))) int *)&__tmp); @@ -773,16 +966,16 @@ inline double remquo(double __x, double __y, int *__quo) { return __r; } + __DEVICE__ -inline double rhypot(double __x, double __y) { - return __ocml_rhypot_f64(__x, __y); -} +double rhypot(double __x, double __y) { return __ocml_rhypot_f64(__x, __y); } + __DEVICE__ -inline double rint(double __x) { return __ocml_rint_f64(__x); } +double rint(double __x) { return __ocml_rint_f64(__x); } + __DEVICE__ -inline double -rnorm(int __dim, - const double *__a) { // TODO: placeholder until OCML adds support. +double rnorm(int __dim, + const double *__a) { // TODO: placeholder until OCML adds support. double __r = 0; while (__dim--) { __r += __a[0] * __a[0]; @@ -791,77 +984,93 @@ rnorm(int __dim, return __ocml_rsqrt_f64(__r); } + __DEVICE__ -inline double rnorm3d(double __x, double __y, double __z) { +double rnorm3d(double __x, double __y, double __z) { return __ocml_rlen3_f64(__x, __y, __z); } + __DEVICE__ -inline double rnorm4d(double __x, double __y, double __z, double __w) { +double rnorm4d(double __x, double __y, double __z, double __w) { return __ocml_rlen4_f64(__x, __y, __z, __w); } + __DEVICE__ -inline double round(double __x) { return __ocml_round_f64(__x); } +double round(double __x) { return __ocml_round_f64(__x); } + __DEVICE__ -inline double rsqrt(double __x) { return __ocml_rsqrt_f64(__x); } +double rsqrt(double __x) { return __ocml_rsqrt_f64(__x); } + __DEVICE__ -inline double scalbln(double __x, long int __n) { +double scalbln(double __x, long int __n) { return (__n < INT_MAX) ? 
__ocml_scalbn_f64(__x, __n) : __ocml_scalb_f64(__x, __n); } __DEVICE__ -inline double scalbn(double __x, int __n) { - return __ocml_scalbn_f64(__x, __n); -} +double scalbn(double __x, int __n) { return __ocml_scalbn_f64(__x, __n); } + __DEVICE__ -inline __RETURN_TYPE signbit(double __x) { return __ocml_signbit_f64(__x); } +__RETURN_TYPE __signbit(double __x) { return __ocml_signbit_f64(__x); } + __DEVICE__ -inline double sin(double __x) { return __ocml_sin_f64(__x); } +double sin(double __x) { return __ocml_sin_f64(__x); } + __DEVICE__ -inline void sincos(double __x, double *__sinptr, double *__cosptr) { +void sincos(double __x, double *__sinptr, double *__cosptr) { double __tmp; *__sinptr = __ocml_sincos_f64( __x, (__attribute__((address_space(5))) double *)&__tmp); *__cosptr = __tmp; } + __DEVICE__ -inline void sincospi(double __x, double *__sinptr, double *__cosptr) { +void sincospi(double __x, double *__sinptr, double *__cosptr) { double __tmp; *__sinptr = __ocml_sincospi_f64( __x, (__attribute__((address_space(5))) double *)&__tmp); *__cosptr = __tmp; } + __DEVICE__ -inline double sinh(double __x) { return __ocml_sinh_f64(__x); } +double sinh(double __x) { return __ocml_sinh_f64(__x); } + __DEVICE__ -inline double sinpi(double __x) { return __ocml_sinpi_f64(__x); } +double sinpi(double __x) { return __ocml_sinpi_f64(__x); } + __DEVICE__ -inline double sqrt(double __x) { return __ocml_sqrt_f64(__x); } +double sqrt(double __x) { return __ocml_sqrt_f64(__x); } + __DEVICE__ -inline double tan(double __x) { return __ocml_tan_f64(__x); } +double tan(double __x) { return __ocml_tan_f64(__x); } + __DEVICE__ -inline double tanh(double __x) { return __ocml_tanh_f64(__x); } +double tanh(double __x) { return __ocml_tanh_f64(__x); } + __DEVICE__ -inline double tgamma(double __x) { return __ocml_tgamma_f64(__x); } +double tgamma(double __x) { return __ocml_tgamma_f64(__x); } + __DEVICE__ -inline double trunc(double __x) { return __ocml_trunc_f64(__x); } +double trunc(double __x) { return __ocml_trunc_f64(__x); } + __DEVICE__ -inline double y0(double __x) { return __ocml_y0_f64(__x); } +double y0(double __x) { return __ocml_y0_f64(__x); } + __DEVICE__ -inline double y1(double __x) { return __ocml_y1_f64(__x); } +double y1(double __x) { return __ocml_y1_f64(__x); } + __DEVICE__ -inline double yn(int __n, - double __x) { // TODO: we could use Ahmes multiplication - // and the Miller & Brown algorithm +double yn(int __n, double __x) { // TODO: we could use Ahmes multiplication + // and the Miller & Brown algorithm // for linear recurrences to get O(log n) steps, but it's unclear if // it'd be beneficial in this case. Placeholder until OCML adds // support. 
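// Note the seed change in the hunk below: the previous double-precision yn()
// started its recurrence from the single-precision Bessel functions of the
// first kind (j0f/j1f); the updated body seeds it with y0()/y1(), so the
// three-term recurrence
//   Y_{n+1}(x) = (2*n/x) * Y_n(x) - Y_{n-1}(x)
// now actually produces Y_n in double precision.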
if (__n == 0) - return j0f(__x); + return y0(__x); if (__n == 1) - return j1f(__x); + return y1(__x); - double __x0 = j0f(__x); - double __x1 = j1f(__x); + double __x0 = y0(__x); + double __x1 = y1(__x); for (int __i = 1; __i < __n; ++__i) { double __x2 = (2 * __i) / __x * __x1 - __x0; __x0 = __x1; @@ -874,296 +1083,182 @@ inline double yn(int __n, // BEGIN INTRINSICS #if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline double __dadd_rd(double __x, double __y) { +double __dadd_rd(double __x, double __y) { return __ocml_add_rtn_f64(__x, __y); } -#endif __DEVICE__ -inline double __dadd_rn(double __x, double __y) { return __x + __y; } -#if defined OCML_BASIC_ROUNDED_OPERATIONS +double __dadd_rn(double __x, double __y) { + return __ocml_add_rte_f64(__x, __y); +} __DEVICE__ -inline double __dadd_ru(double __x, double __y) { +double __dadd_ru(double __x, double __y) { return __ocml_add_rtp_f64(__x, __y); } __DEVICE__ -inline double __dadd_rz(double __x, double __y) { +double __dadd_rz(double __x, double __y) { return __ocml_add_rtz_f64(__x, __y); } +#else __DEVICE__ -inline double __ddiv_rd(double __x, double __y) { - return __ocml_div_rtn_f64(__x, __y); -} +double __dadd_rn(double __x, double __y) { return __x + __y; } #endif -__DEVICE__ -inline double __ddiv_rn(double __x, double __y) { return __x / __y; } + #if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline double __ddiv_ru(double __x, double __y) { +double __ddiv_rd(double __x, double __y) { + return __ocml_div_rtn_f64(__x, __y); +} +__DEVICE__ +double __ddiv_rn(double __x, double __y) { + return __ocml_div_rte_f64(__x, __y); +} +__DEVICE__ +double __ddiv_ru(double __x, double __y) { return __ocml_div_rtp_f64(__x, __y); } __DEVICE__ -inline double __ddiv_rz(double __x, double __y) { +double __ddiv_rz(double __x, double __y) { return __ocml_div_rtz_f64(__x, __y); } +#else __DEVICE__ -inline double __dmul_rd(double __x, double __y) { - return __ocml_mul_rtn_f64(__x, __y); -} +double __ddiv_rn(double __x, double __y) { return __x / __y; } #endif -__DEVICE__ -inline double __dmul_rn(double __x, double __y) { return __x * __y; } + #if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline double __dmul_ru(double __x, double __y) { +double __dmul_rd(double __x, double __y) { + return __ocml_mul_rtn_f64(__x, __y); +} +__DEVICE__ +double __dmul_rn(double __x, double __y) { + return __ocml_mul_rte_f64(__x, __y); +} +__DEVICE__ +double __dmul_ru(double __x, double __y) { return __ocml_mul_rtp_f64(__x, __y); } __DEVICE__ -inline double __dmul_rz(double __x, double __y) { +double __dmul_rz(double __x, double __y) { return __ocml_mul_rtz_f64(__x, __y); } +#else __DEVICE__ -inline double __drcp_rd(double __x) { return __llvm_amdgcn_rcp_f64(__x); } +double __dmul_rn(double __x, double __y) { return __x * __y; } #endif -__DEVICE__ -inline double __drcp_rn(double __x) { return __llvm_amdgcn_rcp_f64(__x); } + #if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline double __drcp_ru(double __x) { return __llvm_amdgcn_rcp_f64(__x); } +double __drcp_rd(double __x) { return __ocml_div_rtn_f64(1.0, __x); } __DEVICE__ -inline double __drcp_rz(double __x) { return __llvm_amdgcn_rcp_f64(__x); } +double __drcp_rn(double __x) { return __ocml_div_rte_f64(1.0, __x); } __DEVICE__ -inline double __dsqrt_rd(double __x) { return __ocml_sqrt_rtn_f64(__x); } +double __drcp_ru(double __x) { return __ocml_div_rtp_f64(1.0, __x); } +__DEVICE__ +double __drcp_rz(double __x) { return __ocml_div_rtz_f64(1.0, __x); } +#else +__DEVICE__ +double __drcp_rn(double 
__x) { return 1.0 / __x; } #endif -__DEVICE__ -inline double __dsqrt_rn(double __x) { return __ocml_sqrt_f64(__x); } + #if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline double __dsqrt_ru(double __x) { return __ocml_sqrt_rtp_f64(__x); } +double __dsqrt_rd(double __x) { return __ocml_sqrt_rtn_f64(__x); } __DEVICE__ -inline double __dsqrt_rz(double __x) { return __ocml_sqrt_rtz_f64(__x); } +double __dsqrt_rn(double __x) { return __ocml_sqrt_rte_f64(__x); } __DEVICE__ -inline double __dsub_rd(double __x, double __y) { +double __dsqrt_ru(double __x) { return __ocml_sqrt_rtp_f64(__x); } +__DEVICE__ +double __dsqrt_rz(double __x) { return __ocml_sqrt_rtz_f64(__x); } +#else +__DEVICE__ +double __dsqrt_rn(double __x) { return __ocml_sqrt_f64(__x); } +#endif + +#if defined OCML_BASIC_ROUNDED_OPERATIONS +__DEVICE__ +double __dsub_rd(double __x, double __y) { return __ocml_sub_rtn_f64(__x, __y); } -#endif __DEVICE__ -inline double __dsub_rn(double __x, double __y) { return __x - __y; } -#if defined OCML_BASIC_ROUNDED_OPERATIONS +double __dsub_rn(double __x, double __y) { + return __ocml_sub_rte_f64(__x, __y); +} __DEVICE__ -inline double __dsub_ru(double __x, double __y) { +double __dsub_ru(double __x, double __y) { return __ocml_sub_rtp_f64(__x, __y); } __DEVICE__ -inline double __dsub_rz(double __x, double __y) { +double __dsub_rz(double __x, double __y) { return __ocml_sub_rtz_f64(__x, __y); } +#else __DEVICE__ -inline double __fma_rd(double __x, double __y, double __z) { - return __ocml_fma_rtn_f64(__x, __y, __z); -} +double __dsub_rn(double __x, double __y) { return __x - __y; } #endif -__DEVICE__ -inline double __fma_rn(double __x, double __y, double __z) { - return __ocml_fma_f64(__x, __y, __z); -} + #if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline double __fma_ru(double __x, double __y, double __z) { +double __fma_rd(double __x, double __y, double __z) { + return __ocml_fma_rtn_f64(__x, __y, __z); +} +__DEVICE__ +double __fma_rn(double __x, double __y, double __z) { + return __ocml_fma_rte_f64(__x, __y, __z); +} +__DEVICE__ +double __fma_ru(double __x, double __y, double __z) { return __ocml_fma_rtp_f64(__x, __y, __z); } __DEVICE__ -inline double __fma_rz(double __x, double __y, double __z) { +double __fma_rz(double __x, double __y, double __z) { return __ocml_fma_rtz_f64(__x, __y, __z); } +#else +__DEVICE__ +double __fma_rn(double __x, double __y, double __z) { + return __ocml_fma_f64(__x, __y, __z); +} #endif // END INTRINSICS // END DOUBLE -// BEGIN INTEGER -__DEVICE__ -inline int abs(int __x) { - int __sgn = __x >> (sizeof(int) * CHAR_BIT - 1); - return (__x ^ __sgn) - __sgn; -} -__DEVICE__ -inline long labs(long __x) { - long __sgn = __x >> (sizeof(long) * CHAR_BIT - 1); - return (__x ^ __sgn) - __sgn; -} -__DEVICE__ -inline long long llabs(long long __x) { - long long __sgn = __x >> (sizeof(long long) * CHAR_BIT - 1); - return (__x ^ __sgn) - __sgn; -} +// C only macros +#if !defined(__cplusplus) && __STDC_VERSION__ >= 201112L +#define isfinite(__x) _Generic((__x), float : __finitef, double : __finite)(__x) +#define isinf(__x) _Generic((__x), float : __isinff, double : __isinf)(__x) +#define isnan(__x) _Generic((__x), float : __isnanf, double : __isnan)(__x) +#define signbit(__x) \ + _Generic((__x), float : __signbitf, double : __signbit)(__x) +#endif // !defined(__cplusplus) && __STDC_VERSION__ >= 201112L #if defined(__cplusplus) -__DEVICE__ -inline long abs(long __x) { return labs(__x); } -__DEVICE__ -inline long long abs(long long __x) { return llabs(__x); } 
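// The integer abs()/labs()/llabs() overloads removed here (they now sit near
// the top of the header, guarded by __cplusplus) are branchless: the
// arithmetic right shift smears the sign bit into a full-width mask, and
// (x ^ mask) - mask negates x exactly when it was negative. A small
// self-contained illustration of the same trick, assuming two's complement
// (abs_branchless is an illustrative name, not part of the header):

#include <limits.h> /* CHAR_BIT */

static inline int abs_branchless(int x) {
  /* -1 (all ones) when x is negative, 0 otherwise. */
  int sign = x >> (sizeof(int) * CHAR_BIT - 1);
  /* Negative x: (~x) + 1 == -x; non-negative x: unchanged. */
  /* As with any int abs, INT_MIN has no representable result. */
  return (x ^ sign) - sign;
}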
-#endif -// END INTEGER - -__DEVICE__ -inline _Float16 fma(_Float16 __x, _Float16 __y, _Float16 __z) { - return __ocml_fma_f16(__x, __y, __z); -} - -__DEVICE__ -inline float fma(float __x, float __y, float __z) { - return fmaf(__x, __y, __z); -} - -#pragma push_macro("__DEF_FUN1") -#pragma push_macro("__DEF_FUN2") -#pragma push_macro("__DEF_FUNI") -#pragma push_macro("__DEF_FLOAT_FUN2I") -#pragma push_macro("__HIP_OVERLOAD1") -#pragma push_macro("__HIP_OVERLOAD2") - -// __hip_enable_if::type is a type function which returns __T if __B is true. -template struct __hip_enable_if {}; - -template struct __hip_enable_if { typedef __T type; }; - -// __HIP_OVERLOAD1 is used to resolve function calls with integer argument to -// avoid compilation error due to ambibuity. e.g. floor(5) is resolved with -// floor(double). -#define __HIP_OVERLOAD1(__retty, __fn) \ - template \ - __DEVICE__ typename __hip_enable_if::is_integer, \ - __retty>::type \ - __fn(__T __x) { \ - return ::__fn((double)__x); \ - } - -// __HIP_OVERLOAD2 is used to resolve function calls with mixed float/double -// or integer argument to avoid compilation error due to ambibuity. e.g. -// max(5.0f, 6.0) is resolved with max(double, double). -#define __HIP_OVERLOAD2(__retty, __fn) \ - template \ - __DEVICE__ \ - typename __hip_enable_if::is_specialized && \ - std::numeric_limits<__T2>::is_specialized, \ - __retty>::type \ - __fn(__T1 __x, __T2 __y) { \ - return __fn((double)__x, (double)__y); \ - } - -// Define cmath functions with float argument and returns float. -#define __DEF_FUN1(__retty, __func) \ - __DEVICE__ \ - inline float __func(float __x) { return __func##f(__x); } \ - __HIP_OVERLOAD1(__retty, __func) - -// Define cmath functions with float argument and returns __retty. -#define __DEF_FUNI(__retty, __func) \ - __DEVICE__ \ - inline __retty __func(float __x) { return __func##f(__x); } \ - __HIP_OVERLOAD1(__retty, __func) - -// define cmath functions with two float arguments. 
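// The __hip_enable_if / __HIP_OVERLOAD machinery removed in this hunk is a
// classic enable_if dispatch: a template that only exposes a nested `type`
// when its boolean parameter is true, used so that integer arguments
// (e.g. floor(5)) resolve to the double overload instead of being ambiguous.
// A minimal standalone sketch of the same idea with illustrative names,
// assuming <cmath> and <limits>:

#include <cmath>
#include <limits>

// Exposes `type` only when B is true; substitution fails otherwise (SFINAE).
template <bool B, class T = void> struct hip_enable_if {};
template <class T> struct hip_enable_if<true, T> { typedef T type; };

static inline double floor_impl(double x) { return std::floor(x); }

// Selected only for integer arguments; forwards them to the double overload.
template <class T>
typename hip_enable_if<std::numeric_limits<T>::is_integer, double>::type
floor_impl(T x) {
  return floor_impl((double)x);
}

// e.g. floor_impl(5) and floor_impl(5.25) both resolve without ambiguity.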
-#define __DEF_FUN2(__retty, __func) \ - __DEVICE__ \ - inline float __func(float __x, float __y) { return __func##f(__x, __y); } \ - __HIP_OVERLOAD2(__retty, __func) - -__DEF_FUN1(double, acos) -__DEF_FUN1(double, acosh) -__DEF_FUN1(double, asin) -__DEF_FUN1(double, asinh) -__DEF_FUN1(double, atan) -__DEF_FUN2(double, atan2); -__DEF_FUN1(double, atanh) -__DEF_FUN1(double, cbrt) -__DEF_FUN1(double, ceil) -__DEF_FUN2(double, copysign); -__DEF_FUN1(double, cos) -__DEF_FUN1(double, cosh) -__DEF_FUN1(double, erf) -__DEF_FUN1(double, erfc) -__DEF_FUN1(double, exp) -__DEF_FUN1(double, exp2) -__DEF_FUN1(double, expm1) -__DEF_FUN1(double, fabs) -__DEF_FUN2(double, fdim); -__DEF_FUN1(double, floor) -__DEF_FUN2(double, fmax); -__DEF_FUN2(double, fmin); -__DEF_FUN2(double, fmod); -//__HIP_OVERLOAD1(int, fpclassify) -__DEF_FUN2(double, hypot); -__DEF_FUNI(int, ilogb) -__HIP_OVERLOAD1(bool, isfinite) -__HIP_OVERLOAD2(bool, isgreater); -__HIP_OVERLOAD2(bool, isgreaterequal); -__HIP_OVERLOAD1(bool, isinf); -__HIP_OVERLOAD2(bool, isless); -__HIP_OVERLOAD2(bool, islessequal); -__HIP_OVERLOAD2(bool, islessgreater); -__HIP_OVERLOAD1(bool, isnan); -//__HIP_OVERLOAD1(bool, isnormal) -__HIP_OVERLOAD2(bool, isunordered); -__DEF_FUN1(double, lgamma) -__DEF_FUN1(double, log) -__DEF_FUN1(double, log10) -__DEF_FUN1(double, log1p) -__DEF_FUN1(double, log2) -__DEF_FUN1(double, logb) -__DEF_FUNI(long long, llrint) -__DEF_FUNI(long long, llround) -__DEF_FUNI(long, lrint) -__DEF_FUNI(long, lround) -__DEF_FUN1(double, nearbyint); -__DEF_FUN2(double, nextafter); -__DEF_FUN2(double, pow); -__DEF_FUN2(double, remainder); -__DEF_FUN1(double, rint); -__DEF_FUN1(double, round); -__HIP_OVERLOAD1(bool, signbit) -__DEF_FUN1(double, sin) -__DEF_FUN1(double, sinh) -__DEF_FUN1(double, sqrt) -__DEF_FUN1(double, tan) -__DEF_FUN1(double, tanh) -__DEF_FUN1(double, tgamma) -__DEF_FUN1(double, trunc); - -// define cmath functions with a float and an integer argument. -#define __DEF_FLOAT_FUN2I(__func) \ - __DEVICE__ \ - inline float __func(float __x, int __y) { return __func##f(__x, __y); } -__DEF_FLOAT_FUN2I(scalbn) - -template __DEVICE__ inline T min(T __arg1, T __arg2) { +template __DEVICE__ T min(T __arg1, T __arg2) { return (__arg1 < __arg2) ? __arg1 : __arg2; } -template __DEVICE__ inline T max(T __arg1, T __arg2) { +template __DEVICE__ T max(T __arg1, T __arg2) { return (__arg1 > __arg2) ? __arg1 : __arg2; } -__DEVICE__ inline int min(int __arg1, int __arg2) { +__DEVICE__ int min(int __arg1, int __arg2) { return (__arg1 < __arg2) ? __arg1 : __arg2; } -__DEVICE__ inline int max(int __arg1, int __arg2) { +__DEVICE__ int max(int __arg1, int __arg2) { return (__arg1 > __arg2) ? 
__arg1 : __arg2; } __DEVICE__ -inline float max(float __x, float __y) { return fmaxf(__x, __y); } +float max(float __x, float __y) { return fmaxf(__x, __y); } __DEVICE__ -inline double max(double __x, double __y) { return fmax(__x, __y); } +double max(double __x, double __y) { return fmax(__x, __y); } __DEVICE__ -inline float min(float __x, float __y) { return fminf(__x, __y); } +float min(float __x, float __y) { return fminf(__x, __y); } __DEVICE__ -inline double min(double __x, double __y) { return fmin(__x, __y); } - -__HIP_OVERLOAD2(double, max) -__HIP_OVERLOAD2(double, min) +double min(double __x, double __y) { return fmin(__x, __y); } __host__ inline static int min(int __arg1, int __arg2) { return std::min(__arg1, __arg2); @@ -1172,13 +1267,8 @@ __host__ inline static int min(int __arg1, int __arg2) { __host__ inline static int max(int __arg1, int __arg2) { return std::max(__arg1, __arg2); } +#endif -#pragma pop_macro("__DEF_FUN1") -#pragma pop_macro("__DEF_FUN2") -#pragma pop_macro("__DEF_FUNI") -#pragma pop_macro("__DEF_FLOAT_FUN2I") -#pragma pop_macro("__HIP_OVERLOAD1") -#pragma pop_macro("__HIP_OVERLOAD2") #pragma pop_macro("__DEVICE__") #pragma pop_macro("__RETURN_TYPE") diff --git a/lib/include/__clang_hip_runtime_wrapper.h b/lib/include/__clang_hip_runtime_wrapper.h index addae5605a..81a16a265a 100644 --- a/lib/include/__clang_hip_runtime_wrapper.h +++ b/lib/include/__clang_hip_runtime_wrapper.h @@ -28,6 +28,10 @@ #define __shared__ __attribute__((shared)) #define __constant__ __attribute__((constant)) +#if !defined(__cplusplus) || __cplusplus < 201103L + #define nullptr NULL; +#endif + #if __HIP_ENABLE_DEVICE_MALLOC__ extern "C" __device__ void *__hip_malloc(size_t __size); extern "C" __device__ void *__hip_free(void *__ptr); @@ -51,6 +55,7 @@ static inline __device__ void *free(void *__ptr) { #if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__ #include <__clang_cuda_math_forward_declares.h> +#include <__clang_hip_cmath.h> #include <__clang_cuda_complex_builtins.h> #include diff --git a/lib/include/altivec.h b/lib/include/altivec.h index ac5f438363..4d50d47d51 100644 --- a/lib/include/altivec.h +++ b/lib/include/altivec.h @@ -1709,6 +1709,20 @@ vec_cmpeq(vector double __a, vector double __b) { } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ vector bool __int128 __ATTRS_o_ai +vec_cmpeq(vector signed __int128 __a, vector signed __int128 __b) { + return (vector bool __int128)__builtin_altivec_vcmpequq( + (vector bool __int128)__a, (vector bool __int128)__b); +} + +static __inline__ vector bool __int128 __ATTRS_o_ai +vec_cmpeq(vector unsigned __int128 __a, vector unsigned __int128 __b) { + return (vector bool __int128)__builtin_altivec_vcmpequq( + (vector bool __int128)__a, (vector bool __int128)__b); +} +#endif + #ifdef __POWER9_VECTOR__ /* vec_cmpne */ @@ -1766,36 +1780,26 @@ vec_cmpne(vector unsigned int __a, vector unsigned int __b) { (vector int)__b); } -static __inline__ vector bool long long __ATTRS_o_ai -vec_cmpne(vector bool long long __a, vector bool long long __b) { - return (vector bool long long) - ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b)); -} - -static __inline__ vector bool long long __ATTRS_o_ai -vec_cmpne(vector signed long long __a, vector signed long long __b) { - return (vector bool long long) - ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b)); -} - -static __inline__ vector bool long long __ATTRS_o_ai -vec_cmpne(vector unsigned long long __a, vector unsigned long long __b) { - return 
(vector bool long long) - ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b)); -} - static __inline__ vector bool int __ATTRS_o_ai vec_cmpne(vector float __a, vector float __b) { return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a, (vector int)__b); } -static __inline__ vector bool long long __ATTRS_o_ai -vec_cmpne(vector double __a, vector double __b) { - return (vector bool long long) - ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b)); +#ifdef __POWER10_VECTOR__ +static __inline__ vector bool __int128 __ATTRS_o_ai +vec_cmpne(vector unsigned __int128 __a, vector unsigned __int128 __b) { + return (vector bool __int128) ~(__builtin_altivec_vcmpequq( + (vector bool __int128)__a, (vector bool __int128)__b)); } +static __inline__ vector bool __int128 __ATTRS_o_ai +vec_cmpne(vector signed __int128 __a, vector signed __int128 __b) { + return (vector bool __int128) ~(__builtin_altivec_vcmpequq( + (vector bool __int128)__a, (vector bool __int128)__b)); +} +#endif + /* vec_cmpnez */ static __inline__ vector bool char __ATTRS_o_ai @@ -1900,6 +1904,86 @@ vec_parity_lsbb(vector signed long long __a) { return __builtin_altivec_vprtybd(__a); } +#else +/* vec_cmpne */ + +static __inline__ vector bool char __ATTRS_o_ai +vec_cmpne(vector bool char __a, vector bool char __b) { + return ~(vec_cmpeq(__a, __b)); +} + +static __inline__ vector bool char __ATTRS_o_ai +vec_cmpne(vector signed char __a, vector signed char __b) { + return ~(vec_cmpeq(__a, __b)); +} + +static __inline__ vector bool char __ATTRS_o_ai +vec_cmpne(vector unsigned char __a, vector unsigned char __b) { + return ~(vec_cmpeq(__a, __b)); +} + +static __inline__ vector bool short __ATTRS_o_ai +vec_cmpne(vector bool short __a, vector bool short __b) { + return ~(vec_cmpeq(__a, __b)); +} + +static __inline__ vector bool short __ATTRS_o_ai +vec_cmpne(vector signed short __a, vector signed short __b) { + return ~(vec_cmpeq(__a, __b)); +} + +static __inline__ vector bool short __ATTRS_o_ai +vec_cmpne(vector unsigned short __a, vector unsigned short __b) { + return ~(vec_cmpeq(__a, __b)); +} + +static __inline__ vector bool int __ATTRS_o_ai +vec_cmpne(vector bool int __a, vector bool int __b) { + return ~(vec_cmpeq(__a, __b)); +} + +static __inline__ vector bool int __ATTRS_o_ai +vec_cmpne(vector signed int __a, vector signed int __b) { + return ~(vec_cmpeq(__a, __b)); +} + +static __inline__ vector bool int __ATTRS_o_ai +vec_cmpne(vector unsigned int __a, vector unsigned int __b) { + return ~(vec_cmpeq(__a, __b)); +} + +static __inline__ vector bool int __ATTRS_o_ai +vec_cmpne(vector float __a, vector float __b) { + return ~(vec_cmpeq(__a, __b)); +} +#endif + +#ifdef __POWER8_VECTOR__ +static __inline__ vector bool long long __ATTRS_o_ai +vec_cmpne(vector bool long long __a, vector bool long long __b) { + return (vector bool long long) + ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b)); +} + +static __inline__ vector bool long long __ATTRS_o_ai +vec_cmpne(vector signed long long __a, vector signed long long __b) { + return (vector bool long long) + ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b)); +} + +static __inline__ vector bool long long __ATTRS_o_ai +vec_cmpne(vector unsigned long long __a, vector unsigned long long __b) { + return (vector bool long long) + ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b)); +} +#endif + +#ifdef __VSX__ +static __inline__ vector bool long long __ATTRS_o_ai 
+vec_cmpne(vector double __a, vector double __b) { + return (vector bool long long) + ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b)); +} #endif /* vec_cmpgt */ @@ -1962,6 +2046,20 @@ vec_cmpgt(vector double __a, vector double __b) { } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ vector bool __int128 __ATTRS_o_ai +vec_cmpgt(vector signed __int128 __a, vector signed __int128 __b) { + return (vector bool __int128)__builtin_altivec_vcmpgtsq( + (vector bool __int128)__a, (vector bool __int128)__b); +} + +static __inline__ vector bool __int128 __ATTRS_o_ai +vec_cmpgt(vector unsigned __int128 __a, vector unsigned __int128 __b) { + return (vector bool __int128)__builtin_altivec_vcmpgtuq( + (vector bool __int128)__a, (vector bool __int128)__b); +} +#endif + /* vec_cmpge */ static __inline__ vector bool char __ATTRS_o_ai @@ -2022,6 +2120,18 @@ vec_cmpge(vector unsigned long long __a, vector unsigned long long __b) { } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ vector bool __int128 __ATTRS_o_ai +vec_cmpge(vector signed __int128 __a, vector signed __int128 __b) { + return ~(vec_cmpgt(__b, __a)); +} + +static __inline__ vector bool __int128 __ATTRS_o_ai +vec_cmpge(vector unsigned __int128 __a, vector unsigned __int128 __b) { + return ~(vec_cmpgt(__b, __a)); +} +#endif + /* vec_vcmpgefp */ static __inline__ vector bool int __attribute__((__always_inline__)) @@ -2134,6 +2244,18 @@ vec_cmple(vector unsigned long long __a, vector unsigned long long __b) { } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ vector bool __int128 __ATTRS_o_ai +vec_cmple(vector signed __int128 __a, vector signed __int128 __b) { + return vec_cmpge(__b, __a); +} + +static __inline__ vector bool __int128 __ATTRS_o_ai +vec_cmple(vector unsigned __int128 __a, vector unsigned __int128 __b) { + return vec_cmpge(__b, __a); +} +#endif + /* vec_cmplt */ static __inline__ vector bool char __ATTRS_o_ai @@ -2178,6 +2300,18 @@ vec_cmplt(vector double __a, vector double __b) { } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ vector bool __int128 __ATTRS_o_ai +vec_cmplt(vector signed __int128 __a, vector signed __int128 __b) { + return vec_cmpgt(__b, __a); +} + +static __inline__ vector bool __int128 __ATTRS_o_ai +vec_cmplt(vector unsigned __int128 __a, vector unsigned __int128 __b) { + return vec_cmpgt(__b, __a); +} +#endif + #ifdef __POWER8_VECTOR__ static __inline__ vector bool long long __ATTRS_o_ai vec_cmplt(vector signed long long __a, vector signed long long __b) { @@ -2702,67 +2836,67 @@ vec_insert_exp(vector unsigned int __a, vector unsigned int __b) { } #if defined(__powerpc64__) -static __inline__ vector signed char __ATTRS_o_ai vec_xl_len(signed char *__a, +static __inline__ vector signed char __ATTRS_o_ai vec_xl_len(const signed char *__a, size_t __b) { return (vector signed char)__builtin_vsx_lxvl(__a, (__b << 56)); } static __inline__ vector unsigned char __ATTRS_o_ai -vec_xl_len(unsigned char *__a, size_t __b) { +vec_xl_len(const unsigned char *__a, size_t __b) { return (vector unsigned char)__builtin_vsx_lxvl(__a, (__b << 56)); } -static __inline__ vector signed short __ATTRS_o_ai vec_xl_len(signed short *__a, +static __inline__ vector signed short __ATTRS_o_ai vec_xl_len(const signed short *__a, size_t __b) { return (vector signed short)__builtin_vsx_lxvl(__a, (__b << 56)); } static __inline__ vector unsigned short __ATTRS_o_ai -vec_xl_len(unsigned short *__a, size_t __b) { +vec_xl_len(const unsigned short *__a, size_t __b) { return (vector unsigned 
short)__builtin_vsx_lxvl(__a, (__b << 56)); } -static __inline__ vector signed int __ATTRS_o_ai vec_xl_len(signed int *__a, +static __inline__ vector signed int __ATTRS_o_ai vec_xl_len(const signed int *__a, size_t __b) { return (vector signed int)__builtin_vsx_lxvl(__a, (__b << 56)); } -static __inline__ vector unsigned int __ATTRS_o_ai vec_xl_len(unsigned int *__a, +static __inline__ vector unsigned int __ATTRS_o_ai vec_xl_len(const unsigned int *__a, size_t __b) { return (vector unsigned int)__builtin_vsx_lxvl(__a, (__b << 56)); } -static __inline__ vector float __ATTRS_o_ai vec_xl_len(float *__a, size_t __b) { +static __inline__ vector float __ATTRS_o_ai vec_xl_len(const float *__a, size_t __b) { return (vector float)__builtin_vsx_lxvl(__a, (__b << 56)); } static __inline__ vector signed __int128 __ATTRS_o_ai -vec_xl_len(signed __int128 *__a, size_t __b) { +vec_xl_len(const signed __int128 *__a, size_t __b) { return (vector signed __int128)__builtin_vsx_lxvl(__a, (__b << 56)); } static __inline__ vector unsigned __int128 __ATTRS_o_ai -vec_xl_len(unsigned __int128 *__a, size_t __b) { +vec_xl_len(const unsigned __int128 *__a, size_t __b) { return (vector unsigned __int128)__builtin_vsx_lxvl(__a, (__b << 56)); } static __inline__ vector signed long long __ATTRS_o_ai -vec_xl_len(signed long long *__a, size_t __b) { +vec_xl_len(const signed long long *__a, size_t __b) { return (vector signed long long)__builtin_vsx_lxvl(__a, (__b << 56)); } static __inline__ vector unsigned long long __ATTRS_o_ai -vec_xl_len(unsigned long long *__a, size_t __b) { +vec_xl_len(const unsigned long long *__a, size_t __b) { return (vector unsigned long long)__builtin_vsx_lxvl(__a, (__b << 56)); } -static __inline__ vector double __ATTRS_o_ai vec_xl_len(double *__a, +static __inline__ vector double __ATTRS_o_ai vec_xl_len(const double *__a, size_t __b) { return (vector double)__builtin_vsx_lxvl(__a, (__b << 56)); } static __inline__ vector unsigned char __ATTRS_o_ai -vec_xl_len_r(unsigned char *__a, size_t __b) { +vec_xl_len_r(const unsigned char *__a, size_t __b) { vector unsigned char __res = (vector unsigned char)__builtin_vsx_lxvll(__a, (__b << 56)); #ifdef __LITTLE_ENDIAN__ @@ -2862,12 +2996,12 @@ static __inline__ void __ATTRS_o_ai vec_xst_len_r(vector unsigned char __a, #ifdef __VSX__ static __inline__ vector float __ATTRS_o_ai vec_cpsgn(vector float __a, vector float __b) { - return __builtin_vsx_xvcpsgnsp(__a, __b); + return __builtin_vsx_xvcpsgnsp(__b, __a); } static __inline__ vector double __ATTRS_o_ai vec_cpsgn(vector double __a, vector double __b) { - return __builtin_vsx_xvcpsgndp(__a, __b); + return __builtin_vsx_xvcpsgndp(__b, __a); } #endif @@ -2951,6 +3085,42 @@ static __inline__ vector double __ATTRS_o_ai vec_cpsgn(vector double __a, #define vec_vctuxs __builtin_altivec_vctuxs +/* vec_signext */ + +#ifdef __POWER9_VECTOR__ +static __inline__ vector signed int __ATTRS_o_ai +vec_signexti(vector signed char __a) { + return __builtin_altivec_vextsb2w(__a); +} + +static __inline__ vector signed int __ATTRS_o_ai +vec_signexti(vector signed short __a) { + return __builtin_altivec_vextsh2w(__a); +} + +static __inline__ vector signed long long __ATTRS_o_ai +vec_signextll(vector signed char __a) { + return __builtin_altivec_vextsb2d(__a); +} + +static __inline__ vector signed long long __ATTRS_o_ai +vec_signextll(vector signed short __a) { + return __builtin_altivec_vextsh2d(__a); +} + +static __inline__ vector signed long long __ATTRS_o_ai +vec_signextll(vector signed int __a) { + return 
__builtin_altivec_vextsw2d(__a); +} +#endif + +#ifdef __POWER10_VECTOR__ +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_signextq(vector signed long long __a) { + return __builtin_altivec_vextsd2q(__a); +} +#endif + /* vec_signed */ static __inline__ vector signed int __ATTRS_o_ai @@ -3288,6 +3458,66 @@ static __inline__ vector double __ATTRS_o_ai vec_div(vector double __a, } #endif +/* vec_dive */ + +#ifdef __POWER10_VECTOR__ +static __inline__ vector signed int __ATTRS_o_ai +vec_dive(vector signed int __a, vector signed int __b) { + return __builtin_altivec_vdivesw(__a, __b); +} + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_dive(vector unsigned int __a, vector unsigned int __b) { + return __builtin_altivec_vdiveuw(__a, __b); +} + +static __inline__ vector signed long long __ATTRS_o_ai +vec_dive(vector signed long long __a, vector signed long long __b) { + return __builtin_altivec_vdivesd(__a, __b); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_dive(vector unsigned long long __a, vector unsigned long long __b) { + return __builtin_altivec_vdiveud(__a, __b); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_dive(vector unsigned __int128 __a, vector unsigned __int128 __b) { + return __builtin_altivec_vdiveuq(__a, __b); +} + +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_dive(vector signed __int128 __a, vector signed __int128 __b) { + return __builtin_altivec_vdivesq(__a, __b); +} +#endif + +#ifdef __POWER10_VECTOR__ +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_div(vector unsigned __int128 __a, vector unsigned __int128 __b) { + return __a / __b; +} + +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_div(vector signed __int128 __a, vector signed __int128 __b) { + return __a / __b; +} +#endif /* __POWER10_VECTOR__ */ + +/* vec_xvtdiv */ + +#ifdef __VSX__ +static __inline__ int __ATTRS_o_ai vec_test_swdiv(vector double __a, + vector double __b) { + return __builtin_vsx_xvtdivdp(__a, __b); +} + +static __inline__ int __ATTRS_o_ai vec_test_swdivs(vector float __a, + vector float __b) { + return __builtin_vsx_xvtdivsp(__a, __b); +} +#endif + /* vec_dss */ #define vec_dss __builtin_altivec_dss @@ -3300,23 +3530,19 @@ static __inline__ void __attribute__((__always_inline__)) vec_dssall(void) { /* vec_dst */ #define vec_dst(__PTR, __CW, __STR) \ - __extension__( \ - { __builtin_altivec_dst((const void *)(__PTR), (__CW), (__STR)); }) + __builtin_altivec_dst((const void *)(__PTR), (__CW), (__STR)) /* vec_dstst */ #define vec_dstst(__PTR, __CW, __STR) \ - __extension__( \ - { __builtin_altivec_dstst((const void *)(__PTR), (__CW), (__STR)); }) + __builtin_altivec_dstst((const void *)(__PTR), (__CW), (__STR)) /* vec_dststt */ #define vec_dststt(__PTR, __CW, __STR) \ - __extension__( \ - { __builtin_altivec_dststt((const void *)(__PTR), (__CW), (__STR)); }) + __builtin_altivec_dststt((const void *)(__PTR), (__CW), (__STR)) /* vec_dstt */ #define vec_dstt(__PTR, __CW, __STR) \ - __extension__( \ - { __builtin_altivec_dstt((const void *)(__PTR), (__CW), (__STR)); }) + __builtin_altivec_dstt((const void *)(__PTR), (__CW), (__STR)) /* vec_eqv */ @@ -5467,6 +5693,16 @@ vec_msum(vector unsigned short __a, vector unsigned short __b, return __builtin_altivec_vmsumuhm(__a, __b, __c); } +/* vec_msumc */ + +#ifdef __POWER10_VECTOR__ +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_msumc(vector unsigned long long __a, vector unsigned long long __b, + vector unsigned __int128 __c) { + return 
__builtin_altivec_vmsumcud(__a, __b, __c); +} +#endif + /* vec_vmsummbm */ static __inline__ vector int __attribute__((__always_inline__)) @@ -5693,6 +5929,26 @@ vec_mule(vector unsigned int __a, vector unsigned int __b) { } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_mule(vector signed long long __a, vector signed long long __b) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vmulosd(__a, __b); +#else + return __builtin_altivec_vmulesd(__a, __b); +#endif +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_mule(vector unsigned long long __a, vector unsigned long long __b) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vmuloud(__a, __b); +#else + return __builtin_altivec_vmuleud(__a, __b); +#endif +} +#endif + /* vec_vmulesb */ static __inline__ vector short __attribute__((__always_inline__)) @@ -5737,6 +5993,30 @@ vec_vmuleuh(vector unsigned short __a, vector unsigned short __b) { #endif } +/* vec_mulh */ + +#ifdef __POWER10_VECTOR__ +static __inline__ vector signed int __ATTRS_o_ai +vec_mulh(vector signed int __a, vector signed int __b) { + return __builtin_altivec_vmulhsw(__a, __b); +} + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_mulh(vector unsigned int __a, vector unsigned int __b) { + return __builtin_altivec_vmulhuw(__a, __b); +} + +static __inline__ vector signed long long __ATTRS_o_ai +vec_mulh(vector signed long long __a, vector signed long long __b) { + return __builtin_altivec_vmulhsd(__a, __b); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_mulh(vector unsigned long long __a, vector unsigned long long __b) { + return __builtin_altivec_vmulhud(__a, __b); +} +#endif + /* vec_mulo */ static __inline__ vector short __ATTRS_o_ai vec_mulo(vector signed char __a, @@ -5795,6 +6075,26 @@ vec_mulo(vector unsigned int __a, vector unsigned int __b) { } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_mulo(vector signed long long __a, vector signed long long __b) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vmulesd(__a, __b); +#else + return __builtin_altivec_vmulosd(__a, __b); +#endif +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_mulo(vector unsigned long long __a, vector unsigned long long __b) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vmuleud(__a, __b); +#else + return __builtin_altivec_vmuloud(__a, __b); +#endif +} +#endif + /* vec_vmulosb */ static __inline__ vector short __attribute__((__always_inline__)) @@ -7627,6 +7927,18 @@ vec_rl(vector unsigned long long __a, vector unsigned long long __b) { } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_rl(vector signed __int128 __a, vector unsigned __int128 __b) { + return (__b << __a)|(__b >> ((__CHAR_BIT__ * sizeof(vector signed __int128)) - __a)); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_rl(vector unsigned __int128 __a, vector unsigned __int128 __b) { + return (__b << __a)|(__b >> ((__CHAR_BIT__ * sizeof(vector unsigned __int128)) - __a)); +} +#endif + /* vec_rlmi */ #ifdef __POWER9_VECTOR__ static __inline__ vector unsigned int __ATTRS_o_ai @@ -7640,8 +7952,24 @@ vec_rlmi(vector unsigned long long __a, vector unsigned long long __b, vector unsigned long long __c) { return __builtin_altivec_vrldmi(__a, __c, __b); } +#endif + +#ifdef __POWER10_VECTOR__ +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_rlmi(vector unsigned __int128 __a, vector unsigned 
__int128 __b, + vector unsigned __int128 __c) { + return __builtin_altivec_vrlqmi(__a, __c, __b); +} + +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_rlmi(vector signed __int128 __a, vector signed __int128 __b, + vector signed __int128 __c) { + return __builtin_altivec_vrlqmi(__a, __c, __b); +} +#endif /* vec_rlnm */ +#ifdef __POWER9_VECTOR__ static __inline__ vector unsigned int __ATTRS_o_ai vec_rlnm(vector unsigned int __a, vector unsigned int __b, vector unsigned int __c) { @@ -7657,6 +7985,42 @@ vec_rlnm(vector unsigned long long __a, vector unsigned long long __b, } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_rlnm(vector unsigned __int128 __a, vector unsigned __int128 __b, + vector unsigned __int128 __c) { + // Merge __b and __c using an appropriate shuffle. + vector unsigned char TmpB = (vector unsigned char)__b; + vector unsigned char TmpC = (vector unsigned char)__c; + vector unsigned char MaskAndShift = +#ifdef __LITTLE_ENDIAN__ + __builtin_shufflevector(TmpB, TmpC, -1, -1, -1, -1, -1, -1, -1, -1, 16, 0, + 1, -1, -1, -1, -1, -1); +#else + __builtin_shufflevector(TmpB, TmpC, -1, -1, -1, -1, -1, 31, 30, 15, -1, + -1, -1, -1, -1, -1, -1, -1); +#endif + return __builtin_altivec_vrlqnm(__a, (vector unsigned __int128) MaskAndShift); +} + +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_rlnm(vector signed __int128 __a, vector signed __int128 __b, + vector signed __int128 __c) { + // Merge __b and __c using an appropriate shuffle. + vector unsigned char TmpB = (vector unsigned char)__b; + vector unsigned char TmpC = (vector unsigned char)__c; + vector unsigned char MaskAndShift = +#ifdef __LITTLE_ENDIAN__ + __builtin_shufflevector(TmpB, TmpC, -1, -1, -1, -1, -1, -1, -1, -1, 16, 0, + 1, -1, -1, -1, -1, -1); +#else + __builtin_shufflevector(TmpB, TmpC, -1, -1, -1, -1, -1, 31, 30, 15, -1, + -1, -1, -1, -1, -1, -1, -1); +#endif + return __builtin_altivec_vrlqnm(__a, (vector unsigned __int128) MaskAndShift); +} +#endif + /* vec_vrlb */ static __inline__ vector signed char __ATTRS_o_ai @@ -7771,6 +8135,18 @@ vec_vrsqrtefp(vector float __a) { return __builtin_altivec_vrsqrtefp(__a); } +/* vec_xvtsqrt */ + +#ifdef __VSX__ +static __inline__ int __ATTRS_o_ai vec_test_swsqrt(vector double __a) { + return __builtin_vsx_xvtsqrtdp(__a); +} + +static __inline__ int __ATTRS_o_ai vec_test_swsqrts(vector float __a) { + return __builtin_vsx_xvtsqrtsp(__a); +} +#endif + /* vec_sel */ #define __builtin_altivec_vsel_4si vec_sel @@ -7905,6 +8281,46 @@ vec_sel(vector double __a, vector double __b, vector unsigned long long __c) { ((vector long long)__b & (vector long long)__c); return (vector double)__res; } + +static __inline__ vector bool long long __ATTRS_o_ai +vec_sel(vector bool long long __a, vector bool long long __b, + vector bool long long __c) { + return (__a & ~__c) | (__b & __c); +} + +static __inline__ vector bool long long __ATTRS_o_ai +vec_sel(vector bool long long __a, vector bool long long __b, + vector unsigned long long __c) { + return (__a & ~(vector bool long long)__c) | + (__b & (vector bool long long)__c); +} + +static __inline__ vector signed long long __ATTRS_o_ai +vec_sel(vector signed long long __a, vector signed long long __b, + vector bool long long __c) { + return (__a & ~(vector signed long long)__c) | + (__b & (vector signed long long)__c); +} + +static __inline__ vector signed long long __ATTRS_o_ai +vec_sel(vector signed long long __a, vector signed long long __b, + vector unsigned long long __c) { + 
return (__a & ~(vector signed long long)__c) | + (__b & (vector signed long long)__c); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_sel(vector unsigned long long __a, vector unsigned long long __b, + vector bool long long __c) { + return (__a & ~(vector unsigned long long)__c) | + (__b & (vector unsigned long long)__c); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_sel(vector unsigned long long __a, vector unsigned long long __b, + vector unsigned long long __c) { + return (__a & ~__c) | (__b & __c); +} #endif /* vec_vsel */ @@ -13900,6 +14316,18 @@ static __inline__ int __ATTRS_o_ai vec_all_eq(vector double __a, } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ int __ATTRS_o_ai vec_all_eq(vector signed __int128 __a, + vector signed __int128 __b) { + return __builtin_altivec_vcmpequq_p(__CR6_LT, __a, __b); +} + +static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned __int128 __a, + vector unsigned __int128 __b) { + return __builtin_altivec_vcmpequq_p(__CR6_LT, __a, __b); +} +#endif + /* vec_all_ge */ static __inline__ int __ATTRS_o_ai vec_all_ge(vector signed char __a, @@ -14071,6 +14499,18 @@ static __inline__ int __ATTRS_o_ai vec_all_ge(vector double __a, } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ int __ATTRS_o_ai vec_all_ge(vector signed __int128 __a, + vector signed __int128 __b) { + return __builtin_altivec_vcmpgtsq_p(__CR6_EQ, __b, __a); +} + +static __inline__ int __ATTRS_o_ai vec_all_ge(vector unsigned __int128 __a, + vector unsigned __int128 __b) { + return __builtin_altivec_vcmpgtuq_p(__CR6_EQ, __b, __a); +} +#endif + /* vec_all_gt */ static __inline__ int __ATTRS_o_ai vec_all_gt(vector signed char __a, @@ -14242,6 +14682,18 @@ static __inline__ int __ATTRS_o_ai vec_all_gt(vector double __a, } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ int __ATTRS_o_ai vec_all_gt(vector signed __int128 __a, + vector signed __int128 __b) { + return __builtin_altivec_vcmpgtsq_p(__CR6_LT, __a, __b); +} + +static __inline__ int __ATTRS_o_ai vec_all_gt(vector unsigned __int128 __a, + vector unsigned __int128 __b) { + return __builtin_altivec_vcmpgtuq_p(__CR6_LT, __a, __b); +} +#endif + /* vec_all_in */ static __inline__ int __attribute__((__always_inline__)) @@ -14421,6 +14873,18 @@ static __inline__ int __ATTRS_o_ai vec_all_le(vector double __a, } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ int __ATTRS_o_ai vec_all_le(vector signed __int128 __a, + vector signed __int128 __b) { + return __builtin_altivec_vcmpgtsq_p(__CR6_EQ, __a, __b); +} + +static __inline__ int __ATTRS_o_ai vec_all_le(vector unsigned __int128 __a, + vector unsigned __int128 __b) { + return __builtin_altivec_vcmpgtuq_p(__CR6_EQ, __a, __b); +} +#endif + /* vec_all_lt */ static __inline__ int __ATTRS_o_ai vec_all_lt(vector signed char __a, @@ -14593,6 +15057,18 @@ static __inline__ int __ATTRS_o_ai vec_all_lt(vector double __a, } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ int __ATTRS_o_ai vec_all_lt(vector signed __int128 __a, + vector signed __int128 __b) { + return __builtin_altivec_vcmpgtsq_p(__CR6_LT, __b, __a); +} + +static __inline__ int __ATTRS_o_ai vec_all_lt(vector unsigned __int128 __a, + vector unsigned __int128 __b) { + return __builtin_altivec_vcmpgtuq_p(__CR6_LT, __b, __a); +} +#endif + /* vec_all_nan */ static __inline__ int __ATTRS_o_ai vec_all_nan(vector float __a) { @@ -14797,6 +15273,18 @@ static __inline__ int __ATTRS_o_ai vec_all_ne(vector double __a, } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ int 
__ATTRS_o_ai vec_all_ne(vector signed __int128 __a, + vector signed __int128 __b) { + return __builtin_altivec_vcmpequq_p(__CR6_EQ, __a, __b); +} + +static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned __int128 __a, + vector unsigned __int128 __b) { + return __builtin_altivec_vcmpequq_p(__CR6_EQ, __a, __b); +} +#endif + /* vec_all_nge */ static __inline__ int __ATTRS_o_ai vec_all_nge(vector float __a, @@ -15042,6 +15530,18 @@ static __inline__ int __ATTRS_o_ai vec_any_eq(vector double __a, } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed __int128 __a, + vector signed __int128 __b) { + return __builtin_altivec_vcmpequq_p(__CR6_EQ_REV, __a, __b); +} + +static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned __int128 __a, + vector unsigned __int128 __b) { + return __builtin_altivec_vcmpequq_p(__CR6_EQ_REV, __a, __b); +} +#endif + /* vec_any_ge */ static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed char __a, @@ -15221,6 +15721,18 @@ static __inline__ int __ATTRS_o_ai vec_any_ge(vector double __a, } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed __int128 __a, + vector signed __int128 __b) { + return __builtin_altivec_vcmpgtsq_p(__CR6_LT_REV, __b, __a); +} + +static __inline__ int __ATTRS_o_ai vec_any_ge(vector unsigned __int128 __a, + vector unsigned __int128 __b) { + return __builtin_altivec_vcmpgtuq_p(__CR6_LT_REV, __b, __a); +} +#endif + /* vec_any_gt */ static __inline__ int __ATTRS_o_ai vec_any_gt(vector signed char __a, @@ -15400,6 +15912,18 @@ static __inline__ int __ATTRS_o_ai vec_any_gt(vector double __a, } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ int __ATTRS_o_ai vec_any_gt(vector signed __int128 __a, + vector signed __int128 __b) { + return __builtin_altivec_vcmpgtsq_p(__CR6_EQ_REV, __a, __b); +} + +static __inline__ int __ATTRS_o_ai vec_any_gt(vector unsigned __int128 __a, + vector unsigned __int128 __b) { + return __builtin_altivec_vcmpgtuq_p(__CR6_EQ_REV, __a, __b); +} +#endif + /* vec_any_le */ static __inline__ int __ATTRS_o_ai vec_any_le(vector signed char __a, @@ -15579,6 +16103,18 @@ static __inline__ int __ATTRS_o_ai vec_any_le(vector double __a, } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ int __ATTRS_o_ai vec_any_le(vector signed __int128 __a, + vector signed __int128 __b) { + return __builtin_altivec_vcmpgtsq_p(__CR6_LT_REV, __a, __b); +} + +static __inline__ int __ATTRS_o_ai vec_any_le(vector unsigned __int128 __a, + vector unsigned __int128 __b) { + return __builtin_altivec_vcmpgtuq_p(__CR6_LT_REV, __a, __b); +} +#endif + /* vec_any_lt */ static __inline__ int __ATTRS_o_ai vec_any_lt(vector signed char __a, @@ -15758,6 +16294,18 @@ static __inline__ int __ATTRS_o_ai vec_any_lt(vector double __a, } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ int __ATTRS_o_ai vec_any_lt(vector signed __int128 __a, + vector signed __int128 __b) { + return __builtin_altivec_vcmpgtsq_p(__CR6_EQ_REV, __b, __a); +} + +static __inline__ int __ATTRS_o_ai vec_any_lt(vector unsigned __int128 __a, + vector unsigned __int128 __b) { + return __builtin_altivec_vcmpgtuq_p(__CR6_EQ_REV, __b, __a); +} +#endif + /* vec_any_nan */ static __inline__ int __attribute__((__always_inline__)) @@ -15953,6 +16501,18 @@ static __inline__ int __ATTRS_o_ai vec_any_ne(vector double __a, } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ int __ATTRS_o_ai vec_any_ne(vector signed __int128 __a, + vector signed __int128 __b) { + return 
__builtin_altivec_vcmpequq_p(__CR6_LT_REV, __a, __b); +} + +static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned __int128 __a, + vector unsigned __int128 __b) { + return __builtin_altivec_vcmpequq_p(__CR6_LT_REV, __a, __b); +} +#endif + /* vec_any_nge */ static __inline__ int __attribute__((__always_inline__)) @@ -16353,41 +16913,41 @@ typedef vector unsigned int unaligned_vec_uint __attribute__((aligned(1))); typedef vector float unaligned_vec_float __attribute__((aligned(1))); static inline __ATTRS_o_ai vector signed char vec_xl(signed long long __offset, - signed char *__ptr) { + const signed char *__ptr) { return *(unaligned_vec_schar *)(__ptr + __offset); } static inline __ATTRS_o_ai vector unsigned char -vec_xl(signed long long __offset, unsigned char *__ptr) { +vec_xl(signed long long __offset, const unsigned char *__ptr) { return *(unaligned_vec_uchar*)(__ptr + __offset); } static inline __ATTRS_o_ai vector signed short vec_xl(signed long long __offset, - signed short *__ptr) { + const signed short *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_sshort *)__addr; } static inline __ATTRS_o_ai vector unsigned short -vec_xl(signed long long __offset, unsigned short *__ptr) { +vec_xl(signed long long __offset, const unsigned short *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_ushort *)__addr; } static inline __ATTRS_o_ai vector signed int vec_xl(signed long long __offset, - signed int *__ptr) { + const signed int *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_sint *)__addr; } static inline __ATTRS_o_ai vector unsigned int vec_xl(signed long long __offset, - unsigned int *__ptr) { + const unsigned int *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_uint *)__addr; } static inline __ATTRS_o_ai vector float vec_xl(signed long long __offset, - float *__ptr) { + const float *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_float *)__addr; } @@ -16398,19 +16958,19 @@ typedef vector unsigned long long unaligned_vec_ull __attribute__((aligned(1))); typedef vector double unaligned_vec_double __attribute__((aligned(1))); static inline __ATTRS_o_ai vector signed long long -vec_xl(signed long long __offset, signed long long *__ptr) { +vec_xl(signed long long __offset, const signed long long *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_sll *)__addr; } static inline __ATTRS_o_ai vector unsigned long long -vec_xl(signed long long __offset, unsigned long long *__ptr) { +vec_xl(signed long long __offset, const unsigned long long *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_ull *)__addr; } static inline __ATTRS_o_ai vector double vec_xl(signed long long __offset, - double *__ptr) { + const double *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_double *)__addr; } @@ -16421,13 +16981,13 @@ typedef vector signed __int128 unaligned_vec_si128 __attribute__((aligned(1))); typedef vector unsigned __int128 unaligned_vec_ui128 __attribute__((aligned(1))); static inline __ATTRS_o_ai vector signed __int128 -vec_xl(signed long long __offset, signed __int128 *__ptr) { +vec_xl(signed long long __offset, const signed __int128 *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_si128 *)__addr; } static inline __ATTRS_o_ai vector unsigned __int128 
-vec_xl(signed long long __offset, unsigned __int128 *__ptr) { +vec_xl(signed long long __offset, const unsigned __int128 *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_ui128 *)__addr; } @@ -16437,71 +16997,71 @@ vec_xl(signed long long __offset, unsigned __int128 *__ptr) { #ifdef __LITTLE_ENDIAN__ static __inline__ vector signed char __ATTRS_o_ai -vec_xl_be(signed long long __offset, signed char *__ptr) { +vec_xl_be(signed long long __offset, const signed char *__ptr) { vector signed char __vec = (vector signed char)__builtin_vsx_lxvd2x_be(__offset, __ptr); return __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); } static __inline__ vector unsigned char __ATTRS_o_ai -vec_xl_be(signed long long __offset, unsigned char *__ptr) { +vec_xl_be(signed long long __offset, const unsigned char *__ptr) { vector unsigned char __vec = (vector unsigned char)__builtin_vsx_lxvd2x_be(__offset, __ptr); return __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); } static __inline__ vector signed short __ATTRS_o_ai -vec_xl_be(signed long long __offset, signed short *__ptr) { +vec_xl_be(signed long long __offset, const signed short *__ptr) { vector signed short __vec = (vector signed short)__builtin_vsx_lxvd2x_be(__offset, __ptr); return __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4); } static __inline__ vector unsigned short __ATTRS_o_ai -vec_xl_be(signed long long __offset, unsigned short *__ptr) { +vec_xl_be(signed long long __offset, const unsigned short *__ptr) { vector unsigned short __vec = (vector unsigned short)__builtin_vsx_lxvd2x_be(__offset, __ptr); return __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4); } static __inline__ vector signed int __ATTRS_o_ai -vec_xl_be(signed long long __offset, signed int *__ptr) { +vec_xl_be(signed long long __offset, const signed int *__ptr) { return (vector signed int)__builtin_vsx_lxvw4x_be(__offset, __ptr); } static __inline__ vector unsigned int __ATTRS_o_ai -vec_xl_be(signed long long __offset, unsigned int *__ptr) { +vec_xl_be(signed long long __offset, const unsigned int *__ptr) { return (vector unsigned int)__builtin_vsx_lxvw4x_be(__offset, __ptr); } static __inline__ vector float __ATTRS_o_ai -vec_xl_be(signed long long __offset, float *__ptr) { +vec_xl_be(signed long long __offset, const float *__ptr) { return (vector float)__builtin_vsx_lxvw4x_be(__offset, __ptr); } #ifdef __VSX__ static __inline__ vector signed long long __ATTRS_o_ai -vec_xl_be(signed long long __offset, signed long long *__ptr) { +vec_xl_be(signed long long __offset, const signed long long *__ptr) { return (vector signed long long)__builtin_vsx_lxvd2x_be(__offset, __ptr); } static __inline__ vector unsigned long long __ATTRS_o_ai -vec_xl_be(signed long long __offset, unsigned long long *__ptr) { +vec_xl_be(signed long long __offset, const unsigned long long *__ptr) { return (vector unsigned long long)__builtin_vsx_lxvd2x_be(__offset, __ptr); } static __inline__ vector double __ATTRS_o_ai -vec_xl_be(signed long long __offset, double *__ptr) { +vec_xl_be(signed long long __offset, const double *__ptr) { return (vector double)__builtin_vsx_lxvd2x_be(__offset, __ptr); } #endif #if defined(__POWER8_VECTOR__) && defined(__powerpc64__) static __inline__ vector signed __int128 __ATTRS_o_ai -vec_xl_be(signed long long __offset, signed __int128 *__ptr) { +vec_xl_be(signed long long __offset, const signed __int128 *__ptr) { return 
vec_xl(__offset, __ptr); } static __inline__ vector unsigned __int128 __ATTRS_o_ai -vec_xl_be(signed long long __offset, unsigned __int128 *__ptr) { +vec_xl_be(signed long long __offset, const unsigned __int128 *__ptr) { return vec_xl(__offset, __ptr); } #endif @@ -16509,6 +17069,54 @@ vec_xl_be(signed long long __offset, unsigned __int128 *__ptr) { #define vec_xl_be vec_xl #endif +#if defined(__POWER10_VECTOR__) && defined(__VSX__) + +/* vect_xl_sext */ + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_xl_sext(signed long long __offset, const signed char *__pointer) { + return (vector unsigned __int128)*(__pointer + __offset); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_xl_sext(signed long long __offset, const signed short *__pointer) { + return (vector unsigned __int128)*(__pointer + __offset); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_xl_sext(signed long long __offset, const signed int *__pointer) { + return (vector unsigned __int128)*(__pointer + __offset); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_xl_sext(signed long long __offset, const signed long long *__pointer) { + return (vector unsigned __int128)*(__pointer + __offset); +} + +/* vec_xl_zext */ + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_xl_zext(signed long long __offset, const unsigned char *__pointer) { + return (vector unsigned __int128)*(__pointer + __offset); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_xl_zext(signed long long __offset, const unsigned short *__pointer) { + return (vector unsigned __int128)*(__pointer + __offset); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_xl_zext(signed long long __offset, const unsigned int *__pointer) { + return (vector unsigned __int128)*(__pointer + __offset); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_xl_zext(signed long long __offset, const unsigned long long *__pointer) { + return (vector unsigned __int128)*(__pointer + __offset); +} + +#endif + /* vec_xst */ static inline __ATTRS_o_ai void vec_xst(vector signed char __vec, @@ -16597,6 +17205,58 @@ static inline __ATTRS_o_ai void vec_xst(vector unsigned __int128 __vec, } #endif +/* vec_xst_trunc */ + +#if defined(__POWER10_VECTOR__) && defined(__VSX__) +static inline __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec, + signed long long __offset, + signed char *__ptr) { + *(__ptr + __offset) = (signed char)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector unsigned __int128 __vec, + signed long long __offset, + unsigned char *__ptr) { + *(__ptr + __offset) = (unsigned char)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec, + signed long long __offset, + signed short *__ptr) { + *(__ptr + __offset) = (signed short)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector unsigned __int128 __vec, + signed long long __offset, + unsigned short *__ptr) { + *(__ptr + __offset) = (unsigned short)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec, + signed long long __offset, + signed int *__ptr) { + *(__ptr + __offset) = (signed int)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector unsigned __int128 __vec, + signed long long __offset, + unsigned int *__ptr) { + *(__ptr + __offset) = (unsigned int)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec, + signed long long __offset, + signed long 
long *__ptr) { + *(__ptr + __offset) = (signed long long)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector unsigned __int128 __vec, + signed long long __offset, + unsigned long long *__ptr) { + *(__ptr + __offset) = (unsigned long long)__vec[0]; +} +#endif + /* vec_xst_be */ #ifdef __LITTLE_ENDIAN__ @@ -16763,6 +17423,100 @@ static vector signed char __ATTRS_o_ai vec_nabs(vector signed char __a) { } #ifdef __POWER10_VECTOR__ + +/* vec_extractm */ + +static __inline__ unsigned int __ATTRS_o_ai +vec_extractm(vector unsigned char __a) { + return __builtin_altivec_vextractbm(__a); +} + +static __inline__ unsigned int __ATTRS_o_ai +vec_extractm(vector unsigned short __a) { + return __builtin_altivec_vextracthm(__a); +} + +static __inline__ unsigned int __ATTRS_o_ai +vec_extractm(vector unsigned int __a) { + return __builtin_altivec_vextractwm(__a); +} + +static __inline__ unsigned int __ATTRS_o_ai +vec_extractm(vector unsigned long long __a) { + return __builtin_altivec_vextractdm(__a); +} + +static __inline__ unsigned int __ATTRS_o_ai +vec_extractm(vector unsigned __int128 __a) { + return __builtin_altivec_vextractqm(__a); +} + +/* vec_expandm */ + +static __inline__ vector unsigned char __ATTRS_o_ai +vec_expandm(vector unsigned char __a) { + return __builtin_altivec_vexpandbm(__a); +} + +static __inline__ vector unsigned short __ATTRS_o_ai +vec_expandm(vector unsigned short __a) { + return __builtin_altivec_vexpandhm(__a); +} + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_expandm(vector unsigned int __a) { + return __builtin_altivec_vexpandwm(__a); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_expandm(vector unsigned long long __a) { + return __builtin_altivec_vexpanddm(__a); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_expandm(vector unsigned __int128 __a) { + return __builtin_altivec_vexpandqm(__a); +} + +/* vec_cntm */ + +#define vec_cntm(__a, __mp) \ + _Generic((__a), vector unsigned char \ + : __builtin_altivec_vcntmbb((__a), (unsigned int)(__mp)), \ + vector unsigned short \ + : __builtin_altivec_vcntmbh((__a), (unsigned int)(__mp)), \ + vector unsigned int \ + : __builtin_altivec_vcntmbw((__a), (unsigned int)(__mp)), \ + vector unsigned long long \ + : __builtin_altivec_vcntmbd((__a), (unsigned int)(__mp))) + +/* vec_gen[b|h|w|d|q]m */ + +static __inline__ vector unsigned char __ATTRS_o_ai +vec_genbm(unsigned long long __bm) { + return __builtin_altivec_mtvsrbm(__bm); +} + +static __inline__ vector unsigned short __ATTRS_o_ai +vec_genhm(unsigned long long __bm) { + return __builtin_altivec_mtvsrhm(__bm); +} + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_genwm(unsigned long long __bm) { + return __builtin_altivec_mtvsrwm(__bm); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_gendm(unsigned long long __bm) { + return __builtin_altivec_mtvsrdm(__bm); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_genqm(unsigned long long __bm) { + return __builtin_altivec_mtvsrqm(__bm); +} + /* vec_pdep */ static __inline__ vector unsigned long long __ATTRS_o_ai @@ -16881,6 +17635,38 @@ vec_cnttzm(vector unsigned long long __a, vector unsigned long long __b) { return __builtin_altivec_vctzdm(__a, __b); } +/* vec_mod */ + +static __inline__ vector signed int __ATTRS_o_ai +vec_mod(vector signed int __a, vector signed int __b) { + return __a % __b; +} + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_mod(vector unsigned int __a, vector unsigned int __b) { + return __a % __b; 
+} + +static __inline__ vector signed long long __ATTRS_o_ai +vec_mod(vector signed long long __a, vector signed long long __b) { + return __a % __b; +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_mod(vector unsigned long long __a, vector unsigned long long __b) { + return __a % __b; +} + +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_mod(vector signed __int128 __a, vector signed __int128 __b) { + return __a % __b; +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_mod(vector unsigned __int128 __a, vector unsigned __int128 __b) { + return __a % __b; +} + /* vec_sldbi */ #define vec_sldb(__a, __b, __c) __builtin_altivec_vsldbi(__a, __b, (__c & 0x7)) @@ -17027,6 +17813,92 @@ vec_inserth(vector unsigned int __a, vector unsigned int __b, #endif } +/* vec_extractl */ + +static __inline__ vector unsigned long long __ATTRS_o_ai vec_extractl( + vector unsigned char __a, vector unsigned char __b, unsigned int __c) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vextdubvrx(__a, __b, __c); +#else + vector unsigned long long __ret = __builtin_altivec_vextdubvlx(__a, __b, __c); + return vec_sld(__ret, __ret, 8); +#endif +} + +static __inline__ vector unsigned long long __ATTRS_o_ai vec_extractl( + vector unsigned short __a, vector unsigned short __b, unsigned int __c) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vextduhvrx(__a, __b, __c); +#else + vector unsigned long long __ret = __builtin_altivec_vextduhvlx(__a, __b, __c); + return vec_sld(__ret, __ret, 8); +#endif +} + +static __inline__ vector unsigned long long __ATTRS_o_ai vec_extractl( + vector unsigned int __a, vector unsigned int __b, unsigned int __c) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vextduwvrx(__a, __b, __c); +#else + vector unsigned long long __ret = __builtin_altivec_vextduwvlx(__a, __b, __c); + return vec_sld(__ret, __ret, 8); +#endif +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_extractl(vector unsigned long long __a, vector unsigned long long __b, + unsigned int __c) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vextddvrx(__a, __b, __c); +#else + vector unsigned long long __ret = __builtin_altivec_vextddvlx(__a, __b, __c); + return vec_sld(__ret, __ret, 8); +#endif +} + +/* vec_extracth */ + +static __inline__ vector unsigned long long __ATTRS_o_ai vec_extracth( + vector unsigned char __a, vector unsigned char __b, unsigned int __c) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vextdubvlx(__a, __b, __c); +#else + vector unsigned long long __ret = __builtin_altivec_vextdubvrx(__a, __b, __c); + return vec_sld(__ret, __ret, 8); +#endif +} + +static __inline__ vector unsigned long long __ATTRS_o_ai vec_extracth( + vector unsigned short __a, vector unsigned short __b, unsigned int __c) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vextduhvlx(__a, __b, __c); +#else + vector unsigned long long __ret = __builtin_altivec_vextduhvrx(__a, __b, __c); + return vec_sld(__ret, __ret, 8); +#endif +} + +static __inline__ vector unsigned long long __ATTRS_o_ai vec_extracth( + vector unsigned int __a, vector unsigned int __b, unsigned int __c) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vextduwvlx(__a, __b, __c); +#else + vector unsigned long long __ret = __builtin_altivec_vextduwvrx(__a, __b, __c); + return vec_sld(__ret, __ret, 8); +#endif +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_extracth(vector unsigned long long __a, vector unsigned long long __b, + unsigned int __c) { +#ifdef 
__LITTLE_ENDIAN__ + return __builtin_altivec_vextddvlx(__a, __b, __c); +#else + vector unsigned long long __ret = __builtin_altivec_vextddvrx(__a, __b, __c); + return vec_sld(__ret, __ret, 8); +#endif +} + #ifdef __VSX__ /* vec_permx */ @@ -17095,6 +17967,14 @@ vec_blendv(vector double __a, vector double __b, return __builtin_vsx_xxblendvd(__a, __b, __c); } +/* vec_replace_elt */ + +#define vec_replace_elt __builtin_altivec_vec_replace_elt + +/* vec_replace_unaligned */ + +#define vec_replace_unaligned __builtin_altivec_vec_replace_unaligned + /* vec_splati */ #define vec_splati(__a) \ @@ -17161,6 +18041,197 @@ vec_test_lsbb_all_zeros(vector unsigned char __a) { return __builtin_vsx_xvtlsbb(__a, 0); } #endif /* __VSX__ */ + +/* vec_stril */ + +static __inline__ vector unsigned char __ATTRS_o_ai +vec_stril(vector unsigned char __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstribr((vector signed char)__a); +#else + return __builtin_altivec_vstribl((vector signed char)__a); +#endif +} + +static __inline__ vector signed char __ATTRS_o_ai +vec_stril(vector signed char __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstribr(__a); +#else + return __builtin_altivec_vstribl(__a); +#endif +} + +static __inline__ vector unsigned short __ATTRS_o_ai +vec_stril(vector unsigned short __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstrihr((vector signed short)__a); +#else + return __builtin_altivec_vstrihl((vector signed short)__a); +#endif +} + +static __inline__ vector signed short __ATTRS_o_ai +vec_stril(vector signed short __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstrihr(__a); +#else + return __builtin_altivec_vstrihl(__a); +#endif +} + +/* vec_stril_p */ + +static __inline__ int __ATTRS_o_ai vec_stril_p(vector unsigned char __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstribr_p(__CR6_EQ, (vector signed char)__a); +#else + return __builtin_altivec_vstribl_p(__CR6_EQ, (vector signed char)__a); +#endif +} + +static __inline__ int __ATTRS_o_ai vec_stril_p(vector signed char __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstribr_p(__CR6_EQ, __a); +#else + return __builtin_altivec_vstribl_p(__CR6_EQ, __a); +#endif +} + +static __inline__ int __ATTRS_o_ai vec_stril_p(vector unsigned short __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstrihr_p(__CR6_EQ, (vector signed short)__a); +#else + return __builtin_altivec_vstrihl_p(__CR6_EQ, (vector signed short)__a); +#endif +} + +static __inline__ int __ATTRS_o_ai vec_stril_p(vector signed short __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstrihr_p(__CR6_EQ, __a); +#else + return __builtin_altivec_vstrihl_p(__CR6_EQ, __a); +#endif +} + +/* vec_strir */ + +static __inline__ vector unsigned char __ATTRS_o_ai +vec_strir(vector unsigned char __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstribl((vector signed char)__a); +#else + return __builtin_altivec_vstribr((vector signed char)__a); +#endif +} + +static __inline__ vector signed char __ATTRS_o_ai +vec_strir(vector signed char __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstribl(__a); +#else + return __builtin_altivec_vstribr(__a); +#endif +} + +static __inline__ vector unsigned short __ATTRS_o_ai +vec_strir(vector unsigned short __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstrihl((vector signed short)__a); +#else + return __builtin_altivec_vstrihr((vector signed short)__a); +#endif +} + +static __inline__ vector signed short __ATTRS_o_ai +vec_strir(vector 
signed short __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstrihl(__a); +#else + return __builtin_altivec_vstrihr(__a); +#endif +} + +/* vec_strir_p */ + +static __inline__ int __ATTRS_o_ai vec_strir_p(vector unsigned char __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstribl_p(__CR6_EQ, (vector signed char)__a); +#else + return __builtin_altivec_vstribr_p(__CR6_EQ, (vector signed char)__a); +#endif +} + +static __inline__ int __ATTRS_o_ai vec_strir_p(vector signed char __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstribl_p(__CR6_EQ, __a); +#else + return __builtin_altivec_vstribr_p(__CR6_EQ, __a); +#endif +} + +static __inline__ int __ATTRS_o_ai vec_strir_p(vector unsigned short __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstrihl_p(__CR6_EQ, (vector signed short)__a); +#else + return __builtin_altivec_vstrihr_p(__CR6_EQ, (vector signed short)__a); +#endif +} + +static __inline__ int __ATTRS_o_ai vec_strir_p(vector signed short __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstrihl_p(__CR6_EQ, __a); +#else + return __builtin_altivec_vstrihr_p(__CR6_EQ, __a); +#endif +} + +/* vs[l | r | ra] */ + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_sl(vector unsigned __int128 __a, vector unsigned __int128 __b) { + return __a << (__b % (vector unsigned __int128)(sizeof(unsigned __int128) * + __CHAR_BIT__)); +} + +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_sl(vector signed __int128 __a, vector unsigned __int128 __b) { + return __a << (__b % (vector unsigned __int128)(sizeof(unsigned __int128) * + __CHAR_BIT__)); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_sr(vector unsigned __int128 __a, vector unsigned __int128 __b) { + return __a >> (__b % (vector unsigned __int128)(sizeof(unsigned __int128) * + __CHAR_BIT__)); +} + +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_sr(vector signed __int128 __a, vector unsigned __int128 __b) { + return ( + vector signed __int128)(((vector unsigned __int128)__a) >> + (__b % + (vector unsigned __int128)(sizeof( + unsigned __int128) * + __CHAR_BIT__))); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_sra(vector unsigned __int128 __a, vector unsigned __int128 __b) { + return ( + vector unsigned __int128)(((vector signed __int128)__a) >> + (__b % + (vector unsigned __int128)(sizeof( + unsigned __int128) * + __CHAR_BIT__))); +} + +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_sra(vector signed __int128 __a, vector unsigned __int128 __b) { + return __a >> (__b % (vector unsigned __int128)(sizeof(unsigned __int128) * + __CHAR_BIT__)); +} + #endif /* __POWER10_VECTOR__ */ #undef __ATTRS_o_ai diff --git a/lib/include/amxintrin.h b/lib/include/amxintrin.h index 58254e21c8..823c7ca1f0 100644 --- a/lib/include/amxintrin.h +++ b/lib/include/amxintrin.h @@ -15,8 +15,8 @@ #define __AMXINTRIN_H #ifdef __x86_64__ -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, __target__("amx-tile"))) +#define __DEFAULT_FN_ATTRS_TILE \ + __attribute__((__always_inline__, __nodebug__, __target__("amx-tile"))) /// Load tile configuration from a 64-byte memory location specified by /// "mem_addr". 
The tile configuration includes the tile type palette, the @@ -31,9 +31,8 @@ /// /// \param __config /// A pointer to 512-bits configuration -static __inline__ void __DEFAULT_FN_ATTRS -_tile_loadconfig(const void *__config) -{ +static __inline__ void __DEFAULT_FN_ATTRS_TILE +_tile_loadconfig(const void *__config) { __builtin_ia32_tile_loadconfig(__config); } @@ -48,9 +47,8 @@ _tile_loadconfig(const void *__config) /// /// \param __config /// A pointer to 512-bits configuration -static __inline__ void __DEFAULT_FN_ATTRS -_tile_storeconfig(void *__config) -{ +static __inline__ void __DEFAULT_FN_ATTRS_TILE +_tile_storeconfig(void *__config) { __builtin_ia32_tile_storeconfig(__config); } @@ -60,9 +58,7 @@ _tile_storeconfig(void *__config) /// \headerfile /// /// This intrinsic corresponds to the TILERELEASE instruction. -static __inline__ void __DEFAULT_FN_ATTRS -_tile_release(void) -{ +static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) { __builtin_ia32_tilerelease(); } @@ -80,8 +76,9 @@ _tile_release(void) /// A pointer to base address. /// \param stride /// The stride between the rows' data to be loaded in memory. -#define _tile_loadd(dst, base, stride) \ - __builtin_ia32_tileloadd64((dst), ((const void *)(base)), (__SIZE_TYPE__)(stride)) +#define _tile_loadd(dst, base, stride) \ + __builtin_ia32_tileloadd64((dst), ((const void *)(base)), \ + (__SIZE_TYPE__)(stride)) /// Load tile rows from memory specifieid by "base" address and "stride" into /// destination tile "dst" using the tile configuration previously configured @@ -99,8 +96,9 @@ _tile_release(void) /// A pointer to base address. /// \param stride /// The stride between the rows' data to be loaded in memory. -#define _tile_stream_loadd(dst, base, stride) \ - __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), (__SIZE_TYPE__)(stride)) +#define _tile_stream_loadd(dst, base, stride) \ + __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \ + (__SIZE_TYPE__)(stride)) /// Store the tile specified by "src" to memory specifieid by "base" address and /// "stride" using the tile configuration previously configured via @@ -116,7 +114,7 @@ _tile_release(void) /// A pointer to base address. /// \param stride /// The stride between the rows' data to be stored in memory. -#define _tile_stored(dst, base, stride) \ +#define _tile_stored(dst, base, stride) \ __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride)) /// Zero the tile specified by "tdest". @@ -145,7 +143,8 @@ _tile_release(void) /// The 1st source tile. Max size is 1024 Bytes. /// \param src1 /// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_dpbssd(dst, src0, src1) __builtin_ia32_tdpbssd((dst), (src0), (src1)) +#define _tile_dpbssd(dst, src0, src1) \ + __builtin_ia32_tdpbssd((dst), (src0), (src1)) /// Compute dot-product of bytes in tiles with a source/destination accumulator. /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with @@ -163,7 +162,8 @@ _tile_release(void) /// The 1st source tile. Max size is 1024 Bytes. /// \param src1 /// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_dpbsud(dst, src0, src1) __builtin_ia32_tdpbsud((dst), (src0), (src1)) +#define _tile_dpbsud(dst, src0, src1) \ + __builtin_ia32_tdpbsud((dst), (src0), (src1)) /// Compute dot-product of bytes in tiles with a source/destination accumulator. /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with @@ -181,7 +181,8 @@ _tile_release(void) /// The 1st source tile. Max size is 1024 Bytes. 
/// \param src1 /// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_dpbusd(dst, src0, src1) __builtin_ia32_tdpbusd((dst), (src0), (src1)) +#define _tile_dpbusd(dst, src0, src1) \ + __builtin_ia32_tdpbusd((dst), (src0), (src1)) /// Compute dot-product of bytes in tiles with a source/destination accumulator. /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with @@ -199,7 +200,8 @@ _tile_release(void) /// The 1st source tile. Max size is 1024 Bytes. /// \param src1 /// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_dpbuud(dst, src0, src1) __builtin_ia32_tdpbuud((dst), (src0), (src1)) +#define _tile_dpbuud(dst, src0, src1) \ + __builtin_ia32_tdpbuud((dst), (src0), (src1)) /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and /// src1, accumulating the intermediate single-precision (32-bit) floating-point @@ -216,10 +218,61 @@ _tile_release(void) /// The 1st source tile. Max size is 1024 Bytes. /// \param src1 /// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_dpbf16ps(dst, src0, src1) \ +#define _tile_dpbf16ps(dst, src0, src1) \ __builtin_ia32_tdpbf16ps((dst), (src0), (src1)) -#undef __DEFAULT_FN_ATTRS +#define __DEFAULT_FN_ATTRS_INT8 \ + __attribute__((__always_inline__, __nodebug__, __target__("amx-int8"))) + +typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64))); +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 +_tile_loadd_internal(unsigned short m, unsigned short n, const void *base, + __SIZE_TYPE__ stride) { + return __builtin_ia32_tileloadd64_internal(m, n, base, + (__SIZE_TYPE__)(stride)); +} + +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 +_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k, + _tile1024i dst, _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2); +} + +static __inline__ void __DEFAULT_FN_ATTRS_INT8 +_tile_stored_internal(unsigned short m, unsigned short n, void *base, + __SIZE_TYPE__ stride, _tile1024i tile) { + return __builtin_ia32_tilestored64_internal(m, n, base, + (__SIZE_TYPE__)(stride), tile); +} + +typedef struct __tile1024i_str { + const unsigned short row; + const unsigned short col; + _tile1024i tile; +} __tile1024i; + +__DEFAULT_FN_ATTRS_TILE +static void __tile_loadd(__tile1024i *dst, const void *base, + __SIZE_TYPE__ stride) { + dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride); +} + +__DEFAULT_FN_ATTRS_INT8 +static void __tile_dpbssd(__tile1024i *dst, __tile1024i src1, + __tile1024i src2) { + dst->tile = _tile_dpbssd_internal(src1.row, src2.col, src1.col, dst->tile, + src1.tile, src2.tile); +} + +__DEFAULT_FN_ATTRS_TILE +static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) { + _tile_stored_internal(src.row, src.col, base, stride, src.tile); +} + +__DEFAULT_FN_ATTRS_TILE +static void __tile_zero(__tile1024i *dst) { + dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col); +} #endif /* __x86_64__ */ #endif /* __AMXINTRIN_H */ diff --git a/lib/include/arm_acle.h b/lib/include/arm_acle.h index de568b4ff9..c156d89c1f 100644 --- a/lib/include/arm_acle.h +++ b/lib/include/arm_acle.h @@ -639,6 +639,32 @@ __jcvt(double __a) { } #endif +/* Armv8.7-A load/store 64-byte intrinsics */ +#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_LS64) +typedef struct { + uint64_t val[8]; +} data512_t; + +static __inline__ data512_t __attribute__((__always_inline__, __nodebug__)) +__arm_ld64b(const void *__addr) { + data512_t __value; 
+ __builtin_arm_ld64b(__addr, __value.val); + return __value; +} +static __inline__ void __attribute__((__always_inline__, __nodebug__)) +__arm_st64b(void *__addr, data512_t __value) { + __builtin_arm_st64b(__addr, __value.val); +} +static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__)) +__arm_st64bv(void *__addr, data512_t __value) { + return __builtin_arm_st64bv(__addr, __value.val); +} +static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__)) +__arm_st64bv0(void *__addr, data512_t __value) { + return __builtin_arm_st64bv0(__addr, __value.val); +} +#endif + /* 10.1 Special register intrinsics */ #define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg) #define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg) diff --git a/lib/include/arm_neon.h b/lib/include/arm_neon.h index da1e17cc00..dbb65b8a49 100644 --- a/lib/include/arm_neon.h +++ b/lib/include/arm_neon.h @@ -40429,6 +40429,638 @@ __ai float32x4_t vcaddq_rot90_f32(float32x4_t __p0, float32x4_t __p1) { } #endif +#ifdef __LITTLE_ENDIAN__ +__ai float32x4_t vcmlaq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) { + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vcmlaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); + return __ret; +} +#else +__ai float32x4_t vcmlaq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) { + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float32x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vcmlaq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +__ai float32x4_t __noswap_vcmlaq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) { + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vcmlaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float32x2_t vcmla_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) { + float32x2_t __ret; + __ret = (float32x2_t) __builtin_neon_vcmla_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); + return __ret; +} +#else +__ai float32x2_t vcmla_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) { + float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); + float32x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + float32x2_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0); + float32x2_t __ret; + __ret = (float32x2_t) __builtin_neon_vcmla_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9); + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); + return __ret; +} +__ai float32x2_t __noswap_vcmla_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) { + float32x2_t __ret; + __ret = (float32x2_t) __builtin_neon_vcmla_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmla_lane_f32(__p0_169, __p1_169, __p2_169, __p3_169) __extension__ ({ \ + float32x2_t __s0_169 = __p0_169; \ + float32x2_t __s1_169 = __p1_169; \ + float32x2_t __s2_169 = __p2_169; \ + float32x2_t __ret_169; \ +float32x2_t __reint_169 = __s2_169; \ +uint64x1_t __reint1_169 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_169, __p3_169)}; \ + __ret_169 = vcmla_f32(__s0_169, __s1_169, *(float32x2_t *) &__reint1_169); \ + __ret_169; \ +}) +#else 
+#define vcmla_lane_f32(__p0_170, __p1_170, __p2_170, __p3_170) __extension__ ({ \ + float32x2_t __s0_170 = __p0_170; \ + float32x2_t __s1_170 = __p1_170; \ + float32x2_t __s2_170 = __p2_170; \ + float32x2_t __rev0_170; __rev0_170 = __builtin_shufflevector(__s0_170, __s0_170, 1, 0); \ + float32x2_t __rev1_170; __rev1_170 = __builtin_shufflevector(__s1_170, __s1_170, 1, 0); \ + float32x2_t __rev2_170; __rev2_170 = __builtin_shufflevector(__s2_170, __s2_170, 1, 0); \ + float32x2_t __ret_170; \ +float32x2_t __reint_170 = __rev2_170; \ +uint64x1_t __reint1_170 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_170, __p3_170)}; \ + __ret_170 = __noswap_vcmla_f32(__rev0_170, __rev1_170, *(float32x2_t *) &__reint1_170); \ + __ret_170 = __builtin_shufflevector(__ret_170, __ret_170, 1, 0); \ + __ret_170; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmlaq_lane_f32(__p0_171, __p1_171, __p2_171, __p3_171) __extension__ ({ \ + float32x4_t __s0_171 = __p0_171; \ + float32x4_t __s1_171 = __p1_171; \ + float32x2_t __s2_171 = __p2_171; \ + float32x4_t __ret_171; \ +float32x2_t __reint_171 = __s2_171; \ +uint64x2_t __reint1_171 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_171, __p3_171), vget_lane_u64(*(uint64x1_t *) &__reint_171, __p3_171)}; \ + __ret_171 = vcmlaq_f32(__s0_171, __s1_171, *(float32x4_t *) &__reint1_171); \ + __ret_171; \ +}) +#else +#define vcmlaq_lane_f32(__p0_172, __p1_172, __p2_172, __p3_172) __extension__ ({ \ + float32x4_t __s0_172 = __p0_172; \ + float32x4_t __s1_172 = __p1_172; \ + float32x2_t __s2_172 = __p2_172; \ + float32x4_t __rev0_172; __rev0_172 = __builtin_shufflevector(__s0_172, __s0_172, 3, 2, 1, 0); \ + float32x4_t __rev1_172; __rev1_172 = __builtin_shufflevector(__s1_172, __s1_172, 3, 2, 1, 0); \ + float32x2_t __rev2_172; __rev2_172 = __builtin_shufflevector(__s2_172, __s2_172, 1, 0); \ + float32x4_t __ret_172; \ +float32x2_t __reint_172 = __rev2_172; \ +uint64x2_t __reint1_172 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_172, __p3_172), vget_lane_u64(*(uint64x1_t *) &__reint_172, __p3_172)}; \ + __ret_172 = __noswap_vcmlaq_f32(__rev0_172, __rev1_172, *(float32x4_t *) &__reint1_172); \ + __ret_172 = __builtin_shufflevector(__ret_172, __ret_172, 3, 2, 1, 0); \ + __ret_172; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmla_laneq_f32(__p0_173, __p1_173, __p2_173, __p3_173) __extension__ ({ \ + float32x2_t __s0_173 = __p0_173; \ + float32x2_t __s1_173 = __p1_173; \ + float32x4_t __s2_173 = __p2_173; \ + float32x2_t __ret_173; \ +float32x4_t __reint_173 = __s2_173; \ +uint64x1_t __reint1_173 = (uint64x1_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_173, __p3_173)}; \ + __ret_173 = vcmla_f32(__s0_173, __s1_173, *(float32x2_t *) &__reint1_173); \ + __ret_173; \ +}) +#else +#define vcmla_laneq_f32(__p0_174, __p1_174, __p2_174, __p3_174) __extension__ ({ \ + float32x2_t __s0_174 = __p0_174; \ + float32x2_t __s1_174 = __p1_174; \ + float32x4_t __s2_174 = __p2_174; \ + float32x2_t __rev0_174; __rev0_174 = __builtin_shufflevector(__s0_174, __s0_174, 1, 0); \ + float32x2_t __rev1_174; __rev1_174 = __builtin_shufflevector(__s1_174, __s1_174, 1, 0); \ + float32x4_t __rev2_174; __rev2_174 = __builtin_shufflevector(__s2_174, __s2_174, 3, 2, 1, 0); \ + float32x2_t __ret_174; \ +float32x4_t __reint_174 = __rev2_174; \ +uint64x1_t __reint1_174 = (uint64x1_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_174, __p3_174)}; \ + __ret_174 = __noswap_vcmla_f32(__rev0_174, __rev1_174, *(float32x2_t *) &__reint1_174); \ + __ret_174 = 
__builtin_shufflevector(__ret_174, __ret_174, 1, 0); \ + __ret_174; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmlaq_laneq_f32(__p0_175, __p1_175, __p2_175, __p3_175) __extension__ ({ \ + float32x4_t __s0_175 = __p0_175; \ + float32x4_t __s1_175 = __p1_175; \ + float32x4_t __s2_175 = __p2_175; \ + float32x4_t __ret_175; \ +float32x4_t __reint_175 = __s2_175; \ +uint64x2_t __reint1_175 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_175, __p3_175), vgetq_lane_u64(*(uint64x2_t *) &__reint_175, __p3_175)}; \ + __ret_175 = vcmlaq_f32(__s0_175, __s1_175, *(float32x4_t *) &__reint1_175); \ + __ret_175; \ +}) +#else +#define vcmlaq_laneq_f32(__p0_176, __p1_176, __p2_176, __p3_176) __extension__ ({ \ + float32x4_t __s0_176 = __p0_176; \ + float32x4_t __s1_176 = __p1_176; \ + float32x4_t __s2_176 = __p2_176; \ + float32x4_t __rev0_176; __rev0_176 = __builtin_shufflevector(__s0_176, __s0_176, 3, 2, 1, 0); \ + float32x4_t __rev1_176; __rev1_176 = __builtin_shufflevector(__s1_176, __s1_176, 3, 2, 1, 0); \ + float32x4_t __rev2_176; __rev2_176 = __builtin_shufflevector(__s2_176, __s2_176, 3, 2, 1, 0); \ + float32x4_t __ret_176; \ +float32x4_t __reint_176 = __rev2_176; \ +uint64x2_t __reint1_176 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_176, __p3_176), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_176, __p3_176)}; \ + __ret_176 = __noswap_vcmlaq_f32(__rev0_176, __rev1_176, *(float32x4_t *) &__reint1_176); \ + __ret_176 = __builtin_shufflevector(__ret_176, __ret_176, 3, 2, 1, 0); \ + __ret_176; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float32x4_t vcmlaq_rot180_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) { + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vcmlaq_rot180_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); + return __ret; +} +#else +__ai float32x4_t vcmlaq_rot180_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) { + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float32x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vcmlaq_rot180_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +__ai float32x4_t __noswap_vcmlaq_rot180_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) { + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vcmlaq_rot180_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float32x2_t vcmla_rot180_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) { + float32x2_t __ret; + __ret = (float32x2_t) __builtin_neon_vcmla_rot180_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); + return __ret; +} +#else +__ai float32x2_t vcmla_rot180_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) { + float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); + float32x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + float32x2_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0); + float32x2_t __ret; + __ret = (float32x2_t) __builtin_neon_vcmla_rot180_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9); + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); + return __ret; +} +__ai float32x2_t __noswap_vcmla_rot180_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) { + float32x2_t 
__ret; + __ret = (float32x2_t) __builtin_neon_vcmla_rot180_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmla_rot180_lane_f32(__p0_177, __p1_177, __p2_177, __p3_177) __extension__ ({ \ + float32x2_t __s0_177 = __p0_177; \ + float32x2_t __s1_177 = __p1_177; \ + float32x2_t __s2_177 = __p2_177; \ + float32x2_t __ret_177; \ +float32x2_t __reint_177 = __s2_177; \ +uint64x1_t __reint1_177 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_177, __p3_177)}; \ + __ret_177 = vcmla_rot180_f32(__s0_177, __s1_177, *(float32x2_t *) &__reint1_177); \ + __ret_177; \ +}) +#else +#define vcmla_rot180_lane_f32(__p0_178, __p1_178, __p2_178, __p3_178) __extension__ ({ \ + float32x2_t __s0_178 = __p0_178; \ + float32x2_t __s1_178 = __p1_178; \ + float32x2_t __s2_178 = __p2_178; \ + float32x2_t __rev0_178; __rev0_178 = __builtin_shufflevector(__s0_178, __s0_178, 1, 0); \ + float32x2_t __rev1_178; __rev1_178 = __builtin_shufflevector(__s1_178, __s1_178, 1, 0); \ + float32x2_t __rev2_178; __rev2_178 = __builtin_shufflevector(__s2_178, __s2_178, 1, 0); \ + float32x2_t __ret_178; \ +float32x2_t __reint_178 = __rev2_178; \ +uint64x1_t __reint1_178 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_178, __p3_178)}; \ + __ret_178 = __noswap_vcmla_rot180_f32(__rev0_178, __rev1_178, *(float32x2_t *) &__reint1_178); \ + __ret_178 = __builtin_shufflevector(__ret_178, __ret_178, 1, 0); \ + __ret_178; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmlaq_rot180_lane_f32(__p0_179, __p1_179, __p2_179, __p3_179) __extension__ ({ \ + float32x4_t __s0_179 = __p0_179; \ + float32x4_t __s1_179 = __p1_179; \ + float32x2_t __s2_179 = __p2_179; \ + float32x4_t __ret_179; \ +float32x2_t __reint_179 = __s2_179; \ +uint64x2_t __reint1_179 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_179, __p3_179), vget_lane_u64(*(uint64x1_t *) &__reint_179, __p3_179)}; \ + __ret_179 = vcmlaq_rot180_f32(__s0_179, __s1_179, *(float32x4_t *) &__reint1_179); \ + __ret_179; \ +}) +#else +#define vcmlaq_rot180_lane_f32(__p0_180, __p1_180, __p2_180, __p3_180) __extension__ ({ \ + float32x4_t __s0_180 = __p0_180; \ + float32x4_t __s1_180 = __p1_180; \ + float32x2_t __s2_180 = __p2_180; \ + float32x4_t __rev0_180; __rev0_180 = __builtin_shufflevector(__s0_180, __s0_180, 3, 2, 1, 0); \ + float32x4_t __rev1_180; __rev1_180 = __builtin_shufflevector(__s1_180, __s1_180, 3, 2, 1, 0); \ + float32x2_t __rev2_180; __rev2_180 = __builtin_shufflevector(__s2_180, __s2_180, 1, 0); \ + float32x4_t __ret_180; \ +float32x2_t __reint_180 = __rev2_180; \ +uint64x2_t __reint1_180 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_180, __p3_180), vget_lane_u64(*(uint64x1_t *) &__reint_180, __p3_180)}; \ + __ret_180 = __noswap_vcmlaq_rot180_f32(__rev0_180, __rev1_180, *(float32x4_t *) &__reint1_180); \ + __ret_180 = __builtin_shufflevector(__ret_180, __ret_180, 3, 2, 1, 0); \ + __ret_180; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmla_rot180_laneq_f32(__p0_181, __p1_181, __p2_181, __p3_181) __extension__ ({ \ + float32x2_t __s0_181 = __p0_181; \ + float32x2_t __s1_181 = __p1_181; \ + float32x4_t __s2_181 = __p2_181; \ + float32x2_t __ret_181; \ +float32x4_t __reint_181 = __s2_181; \ +uint64x1_t __reint1_181 = (uint64x1_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_181, __p3_181)}; \ + __ret_181 = vcmla_rot180_f32(__s0_181, __s1_181, *(float32x2_t *) &__reint1_181); \ + __ret_181; \ +}) +#else +#define vcmla_rot180_laneq_f32(__p0_182, __p1_182, __p2_182, __p3_182) 
__extension__ ({ \ + float32x2_t __s0_182 = __p0_182; \ + float32x2_t __s1_182 = __p1_182; \ + float32x4_t __s2_182 = __p2_182; \ + float32x2_t __rev0_182; __rev0_182 = __builtin_shufflevector(__s0_182, __s0_182, 1, 0); \ + float32x2_t __rev1_182; __rev1_182 = __builtin_shufflevector(__s1_182, __s1_182, 1, 0); \ + float32x4_t __rev2_182; __rev2_182 = __builtin_shufflevector(__s2_182, __s2_182, 3, 2, 1, 0); \ + float32x2_t __ret_182; \ +float32x4_t __reint_182 = __rev2_182; \ +uint64x1_t __reint1_182 = (uint64x1_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_182, __p3_182)}; \ + __ret_182 = __noswap_vcmla_rot180_f32(__rev0_182, __rev1_182, *(float32x2_t *) &__reint1_182); \ + __ret_182 = __builtin_shufflevector(__ret_182, __ret_182, 1, 0); \ + __ret_182; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmlaq_rot180_laneq_f32(__p0_183, __p1_183, __p2_183, __p3_183) __extension__ ({ \ + float32x4_t __s0_183 = __p0_183; \ + float32x4_t __s1_183 = __p1_183; \ + float32x4_t __s2_183 = __p2_183; \ + float32x4_t __ret_183; \ +float32x4_t __reint_183 = __s2_183; \ +uint64x2_t __reint1_183 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_183, __p3_183), vgetq_lane_u64(*(uint64x2_t *) &__reint_183, __p3_183)}; \ + __ret_183 = vcmlaq_rot180_f32(__s0_183, __s1_183, *(float32x4_t *) &__reint1_183); \ + __ret_183; \ +}) +#else +#define vcmlaq_rot180_laneq_f32(__p0_184, __p1_184, __p2_184, __p3_184) __extension__ ({ \ + float32x4_t __s0_184 = __p0_184; \ + float32x4_t __s1_184 = __p1_184; \ + float32x4_t __s2_184 = __p2_184; \ + float32x4_t __rev0_184; __rev0_184 = __builtin_shufflevector(__s0_184, __s0_184, 3, 2, 1, 0); \ + float32x4_t __rev1_184; __rev1_184 = __builtin_shufflevector(__s1_184, __s1_184, 3, 2, 1, 0); \ + float32x4_t __rev2_184; __rev2_184 = __builtin_shufflevector(__s2_184, __s2_184, 3, 2, 1, 0); \ + float32x4_t __ret_184; \ +float32x4_t __reint_184 = __rev2_184; \ +uint64x2_t __reint1_184 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_184, __p3_184), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_184, __p3_184)}; \ + __ret_184 = __noswap_vcmlaq_rot180_f32(__rev0_184, __rev1_184, *(float32x4_t *) &__reint1_184); \ + __ret_184 = __builtin_shufflevector(__ret_184, __ret_184, 3, 2, 1, 0); \ + __ret_184; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float32x4_t vcmlaq_rot270_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) { + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vcmlaq_rot270_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); + return __ret; +} +#else +__ai float32x4_t vcmlaq_rot270_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) { + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float32x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vcmlaq_rot270_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +__ai float32x4_t __noswap_vcmlaq_rot270_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) { + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vcmlaq_rot270_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float32x2_t vcmla_rot270_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) { + float32x2_t __ret; + __ret = (float32x2_t) 
__builtin_neon_vcmla_rot270_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); + return __ret; +} +#else +__ai float32x2_t vcmla_rot270_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) { + float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); + float32x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + float32x2_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0); + float32x2_t __ret; + __ret = (float32x2_t) __builtin_neon_vcmla_rot270_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9); + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); + return __ret; +} +__ai float32x2_t __noswap_vcmla_rot270_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) { + float32x2_t __ret; + __ret = (float32x2_t) __builtin_neon_vcmla_rot270_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmla_rot270_lane_f32(__p0_185, __p1_185, __p2_185, __p3_185) __extension__ ({ \ + float32x2_t __s0_185 = __p0_185; \ + float32x2_t __s1_185 = __p1_185; \ + float32x2_t __s2_185 = __p2_185; \ + float32x2_t __ret_185; \ +float32x2_t __reint_185 = __s2_185; \ +uint64x1_t __reint1_185 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_185, __p3_185)}; \ + __ret_185 = vcmla_rot270_f32(__s0_185, __s1_185, *(float32x2_t *) &__reint1_185); \ + __ret_185; \ +}) +#else +#define vcmla_rot270_lane_f32(__p0_186, __p1_186, __p2_186, __p3_186) __extension__ ({ \ + float32x2_t __s0_186 = __p0_186; \ + float32x2_t __s1_186 = __p1_186; \ + float32x2_t __s2_186 = __p2_186; \ + float32x2_t __rev0_186; __rev0_186 = __builtin_shufflevector(__s0_186, __s0_186, 1, 0); \ + float32x2_t __rev1_186; __rev1_186 = __builtin_shufflevector(__s1_186, __s1_186, 1, 0); \ + float32x2_t __rev2_186; __rev2_186 = __builtin_shufflevector(__s2_186, __s2_186, 1, 0); \ + float32x2_t __ret_186; \ +float32x2_t __reint_186 = __rev2_186; \ +uint64x1_t __reint1_186 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_186, __p3_186)}; \ + __ret_186 = __noswap_vcmla_rot270_f32(__rev0_186, __rev1_186, *(float32x2_t *) &__reint1_186); \ + __ret_186 = __builtin_shufflevector(__ret_186, __ret_186, 1, 0); \ + __ret_186; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmlaq_rot270_lane_f32(__p0_187, __p1_187, __p2_187, __p3_187) __extension__ ({ \ + float32x4_t __s0_187 = __p0_187; \ + float32x4_t __s1_187 = __p1_187; \ + float32x2_t __s2_187 = __p2_187; \ + float32x4_t __ret_187; \ +float32x2_t __reint_187 = __s2_187; \ +uint64x2_t __reint1_187 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_187, __p3_187), vget_lane_u64(*(uint64x1_t *) &__reint_187, __p3_187)}; \ + __ret_187 = vcmlaq_rot270_f32(__s0_187, __s1_187, *(float32x4_t *) &__reint1_187); \ + __ret_187; \ +}) +#else +#define vcmlaq_rot270_lane_f32(__p0_188, __p1_188, __p2_188, __p3_188) __extension__ ({ \ + float32x4_t __s0_188 = __p0_188; \ + float32x4_t __s1_188 = __p1_188; \ + float32x2_t __s2_188 = __p2_188; \ + float32x4_t __rev0_188; __rev0_188 = __builtin_shufflevector(__s0_188, __s0_188, 3, 2, 1, 0); \ + float32x4_t __rev1_188; __rev1_188 = __builtin_shufflevector(__s1_188, __s1_188, 3, 2, 1, 0); \ + float32x2_t __rev2_188; __rev2_188 = __builtin_shufflevector(__s2_188, __s2_188, 1, 0); \ + float32x4_t __ret_188; \ +float32x2_t __reint_188 = __rev2_188; \ +uint64x2_t __reint1_188 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_188, __p3_188), vget_lane_u64(*(uint64x1_t *) &__reint_188, __p3_188)}; \ + __ret_188 = __noswap_vcmlaq_rot270_f32(__rev0_188, 
__rev1_188, *(float32x4_t *) &__reint1_188); \ + __ret_188 = __builtin_shufflevector(__ret_188, __ret_188, 3, 2, 1, 0); \ + __ret_188; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmla_rot270_laneq_f32(__p0_189, __p1_189, __p2_189, __p3_189) __extension__ ({ \ + float32x2_t __s0_189 = __p0_189; \ + float32x2_t __s1_189 = __p1_189; \ + float32x4_t __s2_189 = __p2_189; \ + float32x2_t __ret_189; \ +float32x4_t __reint_189 = __s2_189; \ +uint64x1_t __reint1_189 = (uint64x1_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_189, __p3_189)}; \ + __ret_189 = vcmla_rot270_f32(__s0_189, __s1_189, *(float32x2_t *) &__reint1_189); \ + __ret_189; \ +}) +#else +#define vcmla_rot270_laneq_f32(__p0_190, __p1_190, __p2_190, __p3_190) __extension__ ({ \ + float32x2_t __s0_190 = __p0_190; \ + float32x2_t __s1_190 = __p1_190; \ + float32x4_t __s2_190 = __p2_190; \ + float32x2_t __rev0_190; __rev0_190 = __builtin_shufflevector(__s0_190, __s0_190, 1, 0); \ + float32x2_t __rev1_190; __rev1_190 = __builtin_shufflevector(__s1_190, __s1_190, 1, 0); \ + float32x4_t __rev2_190; __rev2_190 = __builtin_shufflevector(__s2_190, __s2_190, 3, 2, 1, 0); \ + float32x2_t __ret_190; \ +float32x4_t __reint_190 = __rev2_190; \ +uint64x1_t __reint1_190 = (uint64x1_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_190, __p3_190)}; \ + __ret_190 = __noswap_vcmla_rot270_f32(__rev0_190, __rev1_190, *(float32x2_t *) &__reint1_190); \ + __ret_190 = __builtin_shufflevector(__ret_190, __ret_190, 1, 0); \ + __ret_190; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmlaq_rot270_laneq_f32(__p0_191, __p1_191, __p2_191, __p3_191) __extension__ ({ \ + float32x4_t __s0_191 = __p0_191; \ + float32x4_t __s1_191 = __p1_191; \ + float32x4_t __s2_191 = __p2_191; \ + float32x4_t __ret_191; \ +float32x4_t __reint_191 = __s2_191; \ +uint64x2_t __reint1_191 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_191, __p3_191), vgetq_lane_u64(*(uint64x2_t *) &__reint_191, __p3_191)}; \ + __ret_191 = vcmlaq_rot270_f32(__s0_191, __s1_191, *(float32x4_t *) &__reint1_191); \ + __ret_191; \ +}) +#else +#define vcmlaq_rot270_laneq_f32(__p0_192, __p1_192, __p2_192, __p3_192) __extension__ ({ \ + float32x4_t __s0_192 = __p0_192; \ + float32x4_t __s1_192 = __p1_192; \ + float32x4_t __s2_192 = __p2_192; \ + float32x4_t __rev0_192; __rev0_192 = __builtin_shufflevector(__s0_192, __s0_192, 3, 2, 1, 0); \ + float32x4_t __rev1_192; __rev1_192 = __builtin_shufflevector(__s1_192, __s1_192, 3, 2, 1, 0); \ + float32x4_t __rev2_192; __rev2_192 = __builtin_shufflevector(__s2_192, __s2_192, 3, 2, 1, 0); \ + float32x4_t __ret_192; \ +float32x4_t __reint_192 = __rev2_192; \ +uint64x2_t __reint1_192 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_192, __p3_192), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_192, __p3_192)}; \ + __ret_192 = __noswap_vcmlaq_rot270_f32(__rev0_192, __rev1_192, *(float32x4_t *) &__reint1_192); \ + __ret_192 = __builtin_shufflevector(__ret_192, __ret_192, 3, 2, 1, 0); \ + __ret_192; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float32x4_t vcmlaq_rot90_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) { + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vcmlaq_rot90_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); + return __ret; +} +#else +__ai float32x4_t vcmlaq_rot90_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) { + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + 
float32x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vcmlaq_rot90_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +__ai float32x4_t __noswap_vcmlaq_rot90_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) { + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vcmlaq_rot90_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float32x2_t vcmla_rot90_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) { + float32x2_t __ret; + __ret = (float32x2_t) __builtin_neon_vcmla_rot90_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); + return __ret; +} +#else +__ai float32x2_t vcmla_rot90_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) { + float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); + float32x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + float32x2_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0); + float32x2_t __ret; + __ret = (float32x2_t) __builtin_neon_vcmla_rot90_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9); + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); + return __ret; +} +__ai float32x2_t __noswap_vcmla_rot90_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) { + float32x2_t __ret; + __ret = (float32x2_t) __builtin_neon_vcmla_rot90_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmla_rot90_lane_f32(__p0_193, __p1_193, __p2_193, __p3_193) __extension__ ({ \ + float32x2_t __s0_193 = __p0_193; \ + float32x2_t __s1_193 = __p1_193; \ + float32x2_t __s2_193 = __p2_193; \ + float32x2_t __ret_193; \ +float32x2_t __reint_193 = __s2_193; \ +uint64x1_t __reint1_193 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_193, __p3_193)}; \ + __ret_193 = vcmla_rot90_f32(__s0_193, __s1_193, *(float32x2_t *) &__reint1_193); \ + __ret_193; \ +}) +#else +#define vcmla_rot90_lane_f32(__p0_194, __p1_194, __p2_194, __p3_194) __extension__ ({ \ + float32x2_t __s0_194 = __p0_194; \ + float32x2_t __s1_194 = __p1_194; \ + float32x2_t __s2_194 = __p2_194; \ + float32x2_t __rev0_194; __rev0_194 = __builtin_shufflevector(__s0_194, __s0_194, 1, 0); \ + float32x2_t __rev1_194; __rev1_194 = __builtin_shufflevector(__s1_194, __s1_194, 1, 0); \ + float32x2_t __rev2_194; __rev2_194 = __builtin_shufflevector(__s2_194, __s2_194, 1, 0); \ + float32x2_t __ret_194; \ +float32x2_t __reint_194 = __rev2_194; \ +uint64x1_t __reint1_194 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_194, __p3_194)}; \ + __ret_194 = __noswap_vcmla_rot90_f32(__rev0_194, __rev1_194, *(float32x2_t *) &__reint1_194); \ + __ret_194 = __builtin_shufflevector(__ret_194, __ret_194, 1, 0); \ + __ret_194; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmlaq_rot90_lane_f32(__p0_195, __p1_195, __p2_195, __p3_195) __extension__ ({ \ + float32x4_t __s0_195 = __p0_195; \ + float32x4_t __s1_195 = __p1_195; \ + float32x2_t __s2_195 = __p2_195; \ + float32x4_t __ret_195; \ +float32x2_t __reint_195 = __s2_195; \ +uint64x2_t __reint1_195 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_195, __p3_195), vget_lane_u64(*(uint64x1_t *) &__reint_195, __p3_195)}; \ + __ret_195 = vcmlaq_rot90_f32(__s0_195, __s1_195, *(float32x4_t *) &__reint1_195); \ + __ret_195; \ +}) +#else +#define vcmlaq_rot90_lane_f32(__p0_196, __p1_196, 
__p2_196, __p3_196) __extension__ ({ \ + float32x4_t __s0_196 = __p0_196; \ + float32x4_t __s1_196 = __p1_196; \ + float32x2_t __s2_196 = __p2_196; \ + float32x4_t __rev0_196; __rev0_196 = __builtin_shufflevector(__s0_196, __s0_196, 3, 2, 1, 0); \ + float32x4_t __rev1_196; __rev1_196 = __builtin_shufflevector(__s1_196, __s1_196, 3, 2, 1, 0); \ + float32x2_t __rev2_196; __rev2_196 = __builtin_shufflevector(__s2_196, __s2_196, 1, 0); \ + float32x4_t __ret_196; \ +float32x2_t __reint_196 = __rev2_196; \ +uint64x2_t __reint1_196 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_196, __p3_196), vget_lane_u64(*(uint64x1_t *) &__reint_196, __p3_196)}; \ + __ret_196 = __noswap_vcmlaq_rot90_f32(__rev0_196, __rev1_196, *(float32x4_t *) &__reint1_196); \ + __ret_196 = __builtin_shufflevector(__ret_196, __ret_196, 3, 2, 1, 0); \ + __ret_196; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmla_rot90_laneq_f32(__p0_197, __p1_197, __p2_197, __p3_197) __extension__ ({ \ + float32x2_t __s0_197 = __p0_197; \ + float32x2_t __s1_197 = __p1_197; \ + float32x4_t __s2_197 = __p2_197; \ + float32x2_t __ret_197; \ +float32x4_t __reint_197 = __s2_197; \ +uint64x1_t __reint1_197 = (uint64x1_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_197, __p3_197)}; \ + __ret_197 = vcmla_rot90_f32(__s0_197, __s1_197, *(float32x2_t *) &__reint1_197); \ + __ret_197; \ +}) +#else +#define vcmla_rot90_laneq_f32(__p0_198, __p1_198, __p2_198, __p3_198) __extension__ ({ \ + float32x2_t __s0_198 = __p0_198; \ + float32x2_t __s1_198 = __p1_198; \ + float32x4_t __s2_198 = __p2_198; \ + float32x2_t __rev0_198; __rev0_198 = __builtin_shufflevector(__s0_198, __s0_198, 1, 0); \ + float32x2_t __rev1_198; __rev1_198 = __builtin_shufflevector(__s1_198, __s1_198, 1, 0); \ + float32x4_t __rev2_198; __rev2_198 = __builtin_shufflevector(__s2_198, __s2_198, 3, 2, 1, 0); \ + float32x2_t __ret_198; \ +float32x4_t __reint_198 = __rev2_198; \ +uint64x1_t __reint1_198 = (uint64x1_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_198, __p3_198)}; \ + __ret_198 = __noswap_vcmla_rot90_f32(__rev0_198, __rev1_198, *(float32x2_t *) &__reint1_198); \ + __ret_198 = __builtin_shufflevector(__ret_198, __ret_198, 1, 0); \ + __ret_198; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmlaq_rot90_laneq_f32(__p0_199, __p1_199, __p2_199, __p3_199) __extension__ ({ \ + float32x4_t __s0_199 = __p0_199; \ + float32x4_t __s1_199 = __p1_199; \ + float32x4_t __s2_199 = __p2_199; \ + float32x4_t __ret_199; \ +float32x4_t __reint_199 = __s2_199; \ +uint64x2_t __reint1_199 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_199, __p3_199), vgetq_lane_u64(*(uint64x2_t *) &__reint_199, __p3_199)}; \ + __ret_199 = vcmlaq_rot90_f32(__s0_199, __s1_199, *(float32x4_t *) &__reint1_199); \ + __ret_199; \ +}) +#else +#define vcmlaq_rot90_laneq_f32(__p0_200, __p1_200, __p2_200, __p3_200) __extension__ ({ \ + float32x4_t __s0_200 = __p0_200; \ + float32x4_t __s1_200 = __p1_200; \ + float32x4_t __s2_200 = __p2_200; \ + float32x4_t __rev0_200; __rev0_200 = __builtin_shufflevector(__s0_200, __s0_200, 3, 2, 1, 0); \ + float32x4_t __rev1_200; __rev1_200 = __builtin_shufflevector(__s1_200, __s1_200, 3, 2, 1, 0); \ + float32x4_t __rev2_200; __rev2_200 = __builtin_shufflevector(__s2_200, __s2_200, 3, 2, 1, 0); \ + float32x4_t __ret_200; \ +float32x4_t __reint_200 = __rev2_200; \ +uint64x2_t __reint1_200 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_200, __p3_200), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_200, __p3_200)}; \ + __ret_200 = 
__noswap_vcmlaq_rot90_f32(__rev0_200, __rev1_200, *(float32x4_t *) &__reint1_200); \ + __ret_200 = __builtin_shufflevector(__ret_200, __ret_200, 3, 2, 1, 0); \ + __ret_200; \ +}) +#endif + #endif #if defined(__ARM_FEATURE_COMPLEX) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) #ifdef __LITTLE_ENDIAN__ @@ -40499,6 +41131,638 @@ __ai float16x8_t vcaddq_rot90_f16(float16x8_t __p0, float16x8_t __p1) { } #endif +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vcmlaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcmlaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40); + return __ret; +} +#else +__ai float16x8_t vcmlaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcmlaq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +__ai float16x8_t __noswap_vcmlaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcmlaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vcmla_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcmla_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8); + return __ret; +} +#else +__ai float16x4_t vcmla_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcmla_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +__ai float16x4_t __noswap_vcmla_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcmla_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmla_lane_f16(__p0_201, __p1_201, __p2_201, __p3_201) __extension__ ({ \ + float16x4_t __s0_201 = __p0_201; \ + float16x4_t __s1_201 = __p1_201; \ + float16x4_t __s2_201 = __p2_201; \ + float16x4_t __ret_201; \ +float16x4_t __reint_201 = __s2_201; \ +uint32x2_t __reint1_201 = (uint32x2_t) {vget_lane_u32(*(uint32x2_t *) &__reint_201, __p3_201), vget_lane_u32(*(uint32x2_t *) &__reint_201, __p3_201)}; \ + __ret_201 = vcmla_f16(__s0_201, __s1_201, *(float16x4_t *) &__reint1_201); \ + __ret_201; \ +}) +#else +#define vcmla_lane_f16(__p0_202, __p1_202, __p2_202, __p3_202) __extension__ ({ \ + float16x4_t __s0_202 = __p0_202; \ + float16x4_t __s1_202 = __p1_202; \ + float16x4_t __s2_202 = __p2_202; \ + float16x4_t __rev0_202; __rev0_202 = __builtin_shufflevector(__s0_202, __s0_202, 3, 2, 1, 0); \ + float16x4_t __rev1_202; __rev1_202 = __builtin_shufflevector(__s1_202, __s1_202, 3, 2, 1, 0); \ + float16x4_t __rev2_202; __rev2_202 = __builtin_shufflevector(__s2_202, __s2_202, 3, 
2, 1, 0); \ + float16x4_t __ret_202; \ +float16x4_t __reint_202 = __rev2_202; \ +uint32x2_t __reint1_202 = (uint32x2_t) {__noswap_vget_lane_u32(*(uint32x2_t *) &__reint_202, __p3_202), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_202, __p3_202)}; \ + __ret_202 = __noswap_vcmla_f16(__rev0_202, __rev1_202, *(float16x4_t *) &__reint1_202); \ + __ret_202 = __builtin_shufflevector(__ret_202, __ret_202, 3, 2, 1, 0); \ + __ret_202; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmlaq_lane_f16(__p0_203, __p1_203, __p2_203, __p3_203) __extension__ ({ \ + float16x8_t __s0_203 = __p0_203; \ + float16x8_t __s1_203 = __p1_203; \ + float16x4_t __s2_203 = __p2_203; \ + float16x8_t __ret_203; \ +float16x4_t __reint_203 = __s2_203; \ +uint32x4_t __reint1_203 = (uint32x4_t) {vget_lane_u32(*(uint32x2_t *) &__reint_203, __p3_203), vget_lane_u32(*(uint32x2_t *) &__reint_203, __p3_203), vget_lane_u32(*(uint32x2_t *) &__reint_203, __p3_203), vget_lane_u32(*(uint32x2_t *) &__reint_203, __p3_203)}; \ + __ret_203 = vcmlaq_f16(__s0_203, __s1_203, *(float16x8_t *) &__reint1_203); \ + __ret_203; \ +}) +#else +#define vcmlaq_lane_f16(__p0_204, __p1_204, __p2_204, __p3_204) __extension__ ({ \ + float16x8_t __s0_204 = __p0_204; \ + float16x8_t __s1_204 = __p1_204; \ + float16x4_t __s2_204 = __p2_204; \ + float16x8_t __rev0_204; __rev0_204 = __builtin_shufflevector(__s0_204, __s0_204, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1_204; __rev1_204 = __builtin_shufflevector(__s1_204, __s1_204, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __rev2_204; __rev2_204 = __builtin_shufflevector(__s2_204, __s2_204, 3, 2, 1, 0); \ + float16x8_t __ret_204; \ +float16x4_t __reint_204 = __rev2_204; \ +uint32x4_t __reint1_204 = (uint32x4_t) {__noswap_vget_lane_u32(*(uint32x2_t *) &__reint_204, __p3_204), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_204, __p3_204), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_204, __p3_204), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_204, __p3_204)}; \ + __ret_204 = __noswap_vcmlaq_f16(__rev0_204, __rev1_204, *(float16x8_t *) &__reint1_204); \ + __ret_204 = __builtin_shufflevector(__ret_204, __ret_204, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_204; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmla_laneq_f16(__p0_205, __p1_205, __p2_205, __p3_205) __extension__ ({ \ + float16x4_t __s0_205 = __p0_205; \ + float16x4_t __s1_205 = __p1_205; \ + float16x8_t __s2_205 = __p2_205; \ + float16x4_t __ret_205; \ +float16x8_t __reint_205 = __s2_205; \ +uint32x2_t __reint1_205 = (uint32x2_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_205, __p3_205), vgetq_lane_u32(*(uint32x4_t *) &__reint_205, __p3_205)}; \ + __ret_205 = vcmla_f16(__s0_205, __s1_205, *(float16x4_t *) &__reint1_205); \ + __ret_205; \ +}) +#else +#define vcmla_laneq_f16(__p0_206, __p1_206, __p2_206, __p3_206) __extension__ ({ \ + float16x4_t __s0_206 = __p0_206; \ + float16x4_t __s1_206 = __p1_206; \ + float16x8_t __s2_206 = __p2_206; \ + float16x4_t __rev0_206; __rev0_206 = __builtin_shufflevector(__s0_206, __s0_206, 3, 2, 1, 0); \ + float16x4_t __rev1_206; __rev1_206 = __builtin_shufflevector(__s1_206, __s1_206, 3, 2, 1, 0); \ + float16x8_t __rev2_206; __rev2_206 = __builtin_shufflevector(__s2_206, __s2_206, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __ret_206; \ +float16x8_t __reint_206 = __rev2_206; \ +uint32x2_t __reint1_206 = (uint32x2_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_206, __p3_206), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_206, __p3_206)}; \ + __ret_206 = __noswap_vcmla_f16(__rev0_206, __rev1_206, 
*(float16x4_t *) &__reint1_206); \ + __ret_206 = __builtin_shufflevector(__ret_206, __ret_206, 3, 2, 1, 0); \ + __ret_206; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmlaq_laneq_f16(__p0_207, __p1_207, __p2_207, __p3_207) __extension__ ({ \ + float16x8_t __s0_207 = __p0_207; \ + float16x8_t __s1_207 = __p1_207; \ + float16x8_t __s2_207 = __p2_207; \ + float16x8_t __ret_207; \ +float16x8_t __reint_207 = __s2_207; \ +uint32x4_t __reint1_207 = (uint32x4_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_207, __p3_207), vgetq_lane_u32(*(uint32x4_t *) &__reint_207, __p3_207), vgetq_lane_u32(*(uint32x4_t *) &__reint_207, __p3_207), vgetq_lane_u32(*(uint32x4_t *) &__reint_207, __p3_207)}; \ + __ret_207 = vcmlaq_f16(__s0_207, __s1_207, *(float16x8_t *) &__reint1_207); \ + __ret_207; \ +}) +#else +#define vcmlaq_laneq_f16(__p0_208, __p1_208, __p2_208, __p3_208) __extension__ ({ \ + float16x8_t __s0_208 = __p0_208; \ + float16x8_t __s1_208 = __p1_208; \ + float16x8_t __s2_208 = __p2_208; \ + float16x8_t __rev0_208; __rev0_208 = __builtin_shufflevector(__s0_208, __s0_208, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1_208; __rev1_208 = __builtin_shufflevector(__s1_208, __s1_208, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev2_208; __rev2_208 = __builtin_shufflevector(__s2_208, __s2_208, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret_208; \ +float16x8_t __reint_208 = __rev2_208; \ +uint32x4_t __reint1_208 = (uint32x4_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_208, __p3_208), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_208, __p3_208), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_208, __p3_208), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_208, __p3_208)}; \ + __ret_208 = __noswap_vcmlaq_f16(__rev0_208, __rev1_208, *(float16x8_t *) &__reint1_208); \ + __ret_208 = __builtin_shufflevector(__ret_208, __ret_208, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_208; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vcmlaq_rot180_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcmlaq_rot180_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40); + return __ret; +} +#else +__ai float16x8_t vcmlaq_rot180_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcmlaq_rot180_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +__ai float16x8_t __noswap_vcmlaq_rot180_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcmlaq_rot180_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vcmla_rot180_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcmla_rot180_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8); + return __ret; +} +#else +__ai float16x4_t vcmla_rot180_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 
2, 1, 0); + float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcmla_rot180_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +__ai float16x4_t __noswap_vcmla_rot180_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcmla_rot180_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmla_rot180_lane_f16(__p0_209, __p1_209, __p2_209, __p3_209) __extension__ ({ \ + float16x4_t __s0_209 = __p0_209; \ + float16x4_t __s1_209 = __p1_209; \ + float16x4_t __s2_209 = __p2_209; \ + float16x4_t __ret_209; \ +float16x4_t __reint_209 = __s2_209; \ +uint32x2_t __reint1_209 = (uint32x2_t) {vget_lane_u32(*(uint32x2_t *) &__reint_209, __p3_209), vget_lane_u32(*(uint32x2_t *) &__reint_209, __p3_209)}; \ + __ret_209 = vcmla_rot180_f16(__s0_209, __s1_209, *(float16x4_t *) &__reint1_209); \ + __ret_209; \ +}) +#else +#define vcmla_rot180_lane_f16(__p0_210, __p1_210, __p2_210, __p3_210) __extension__ ({ \ + float16x4_t __s0_210 = __p0_210; \ + float16x4_t __s1_210 = __p1_210; \ + float16x4_t __s2_210 = __p2_210; \ + float16x4_t __rev0_210; __rev0_210 = __builtin_shufflevector(__s0_210, __s0_210, 3, 2, 1, 0); \ + float16x4_t __rev1_210; __rev1_210 = __builtin_shufflevector(__s1_210, __s1_210, 3, 2, 1, 0); \ + float16x4_t __rev2_210; __rev2_210 = __builtin_shufflevector(__s2_210, __s2_210, 3, 2, 1, 0); \ + float16x4_t __ret_210; \ +float16x4_t __reint_210 = __rev2_210; \ +uint32x2_t __reint1_210 = (uint32x2_t) {__noswap_vget_lane_u32(*(uint32x2_t *) &__reint_210, __p3_210), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_210, __p3_210)}; \ + __ret_210 = __noswap_vcmla_rot180_f16(__rev0_210, __rev1_210, *(float16x4_t *) &__reint1_210); \ + __ret_210 = __builtin_shufflevector(__ret_210, __ret_210, 3, 2, 1, 0); \ + __ret_210; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmlaq_rot180_lane_f16(__p0_211, __p1_211, __p2_211, __p3_211) __extension__ ({ \ + float16x8_t __s0_211 = __p0_211; \ + float16x8_t __s1_211 = __p1_211; \ + float16x4_t __s2_211 = __p2_211; \ + float16x8_t __ret_211; \ +float16x4_t __reint_211 = __s2_211; \ +uint32x4_t __reint1_211 = (uint32x4_t) {vget_lane_u32(*(uint32x2_t *) &__reint_211, __p3_211), vget_lane_u32(*(uint32x2_t *) &__reint_211, __p3_211), vget_lane_u32(*(uint32x2_t *) &__reint_211, __p3_211), vget_lane_u32(*(uint32x2_t *) &__reint_211, __p3_211)}; \ + __ret_211 = vcmlaq_rot180_f16(__s0_211, __s1_211, *(float16x8_t *) &__reint1_211); \ + __ret_211; \ +}) +#else +#define vcmlaq_rot180_lane_f16(__p0_212, __p1_212, __p2_212, __p3_212) __extension__ ({ \ + float16x8_t __s0_212 = __p0_212; \ + float16x8_t __s1_212 = __p1_212; \ + float16x4_t __s2_212 = __p2_212; \ + float16x8_t __rev0_212; __rev0_212 = __builtin_shufflevector(__s0_212, __s0_212, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1_212; __rev1_212 = __builtin_shufflevector(__s1_212, __s1_212, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __rev2_212; __rev2_212 = __builtin_shufflevector(__s2_212, __s2_212, 3, 2, 1, 0); \ + float16x8_t __ret_212; \ +float16x4_t __reint_212 = __rev2_212; \ +uint32x4_t __reint1_212 = (uint32x4_t) {__noswap_vget_lane_u32(*(uint32x2_t *) &__reint_212, __p3_212), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_212, __p3_212), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_212, __p3_212), 
__noswap_vget_lane_u32(*(uint32x2_t *) &__reint_212, __p3_212)}; \ + __ret_212 = __noswap_vcmlaq_rot180_f16(__rev0_212, __rev1_212, *(float16x8_t *) &__reint1_212); \ + __ret_212 = __builtin_shufflevector(__ret_212, __ret_212, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_212; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmla_rot180_laneq_f16(__p0_213, __p1_213, __p2_213, __p3_213) __extension__ ({ \ + float16x4_t __s0_213 = __p0_213; \ + float16x4_t __s1_213 = __p1_213; \ + float16x8_t __s2_213 = __p2_213; \ + float16x4_t __ret_213; \ +float16x8_t __reint_213 = __s2_213; \ +uint32x2_t __reint1_213 = (uint32x2_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_213, __p3_213), vgetq_lane_u32(*(uint32x4_t *) &__reint_213, __p3_213)}; \ + __ret_213 = vcmla_rot180_f16(__s0_213, __s1_213, *(float16x4_t *) &__reint1_213); \ + __ret_213; \ +}) +#else +#define vcmla_rot180_laneq_f16(__p0_214, __p1_214, __p2_214, __p3_214) __extension__ ({ \ + float16x4_t __s0_214 = __p0_214; \ + float16x4_t __s1_214 = __p1_214; \ + float16x8_t __s2_214 = __p2_214; \ + float16x4_t __rev0_214; __rev0_214 = __builtin_shufflevector(__s0_214, __s0_214, 3, 2, 1, 0); \ + float16x4_t __rev1_214; __rev1_214 = __builtin_shufflevector(__s1_214, __s1_214, 3, 2, 1, 0); \ + float16x8_t __rev2_214; __rev2_214 = __builtin_shufflevector(__s2_214, __s2_214, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __ret_214; \ +float16x8_t __reint_214 = __rev2_214; \ +uint32x2_t __reint1_214 = (uint32x2_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_214, __p3_214), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_214, __p3_214)}; \ + __ret_214 = __noswap_vcmla_rot180_f16(__rev0_214, __rev1_214, *(float16x4_t *) &__reint1_214); \ + __ret_214 = __builtin_shufflevector(__ret_214, __ret_214, 3, 2, 1, 0); \ + __ret_214; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmlaq_rot180_laneq_f16(__p0_215, __p1_215, __p2_215, __p3_215) __extension__ ({ \ + float16x8_t __s0_215 = __p0_215; \ + float16x8_t __s1_215 = __p1_215; \ + float16x8_t __s2_215 = __p2_215; \ + float16x8_t __ret_215; \ +float16x8_t __reint_215 = __s2_215; \ +uint32x4_t __reint1_215 = (uint32x4_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_215, __p3_215), vgetq_lane_u32(*(uint32x4_t *) &__reint_215, __p3_215), vgetq_lane_u32(*(uint32x4_t *) &__reint_215, __p3_215), vgetq_lane_u32(*(uint32x4_t *) &__reint_215, __p3_215)}; \ + __ret_215 = vcmlaq_rot180_f16(__s0_215, __s1_215, *(float16x8_t *) &__reint1_215); \ + __ret_215; \ +}) +#else +#define vcmlaq_rot180_laneq_f16(__p0_216, __p1_216, __p2_216, __p3_216) __extension__ ({ \ + float16x8_t __s0_216 = __p0_216; \ + float16x8_t __s1_216 = __p1_216; \ + float16x8_t __s2_216 = __p2_216; \ + float16x8_t __rev0_216; __rev0_216 = __builtin_shufflevector(__s0_216, __s0_216, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1_216; __rev1_216 = __builtin_shufflevector(__s1_216, __s1_216, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev2_216; __rev2_216 = __builtin_shufflevector(__s2_216, __s2_216, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret_216; \ +float16x8_t __reint_216 = __rev2_216; \ +uint32x4_t __reint1_216 = (uint32x4_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_216, __p3_216), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_216, __p3_216), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_216, __p3_216), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_216, __p3_216)}; \ + __ret_216 = __noswap_vcmlaq_rot180_f16(__rev0_216, __rev1_216, *(float16x8_t *) &__reint1_216); \ + __ret_216 = __builtin_shufflevector(__ret_216, __ret_216, 7, 6, 5, 4, 3, 
2, 1, 0); \ + __ret_216; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vcmlaq_rot270_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcmlaq_rot270_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40); + return __ret; +} +#else +__ai float16x8_t vcmlaq_rot270_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcmlaq_rot270_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +__ai float16x8_t __noswap_vcmlaq_rot270_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcmlaq_rot270_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vcmla_rot270_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcmla_rot270_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8); + return __ret; +} +#else +__ai float16x4_t vcmla_rot270_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcmla_rot270_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +__ai float16x4_t __noswap_vcmla_rot270_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcmla_rot270_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmla_rot270_lane_f16(__p0_217, __p1_217, __p2_217, __p3_217) __extension__ ({ \ + float16x4_t __s0_217 = __p0_217; \ + float16x4_t __s1_217 = __p1_217; \ + float16x4_t __s2_217 = __p2_217; \ + float16x4_t __ret_217; \ +float16x4_t __reint_217 = __s2_217; \ +uint32x2_t __reint1_217 = (uint32x2_t) {vget_lane_u32(*(uint32x2_t *) &__reint_217, __p3_217), vget_lane_u32(*(uint32x2_t *) &__reint_217, __p3_217)}; \ + __ret_217 = vcmla_rot270_f16(__s0_217, __s1_217, *(float16x4_t *) &__reint1_217); \ + __ret_217; \ +}) +#else +#define vcmla_rot270_lane_f16(__p0_218, __p1_218, __p2_218, __p3_218) __extension__ ({ \ + float16x4_t __s0_218 = __p0_218; \ + float16x4_t __s1_218 = __p1_218; \ + float16x4_t __s2_218 = __p2_218; \ + float16x4_t __rev0_218; __rev0_218 = __builtin_shufflevector(__s0_218, __s0_218, 3, 2, 1, 0); \ + float16x4_t __rev1_218; __rev1_218 = __builtin_shufflevector(__s1_218, __s1_218, 3, 2, 1, 0); \ + float16x4_t __rev2_218; __rev2_218 = __builtin_shufflevector(__s2_218, __s2_218, 3, 2, 1, 0); \ + float16x4_t __ret_218; \ +float16x4_t __reint_218 = __rev2_218; \ +uint32x2_t __reint1_218 = (uint32x2_t) {__noswap_vget_lane_u32(*(uint32x2_t *) &__reint_218, __p3_218), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_218, __p3_218)}; \ + __ret_218 = 
__noswap_vcmla_rot270_f16(__rev0_218, __rev1_218, *(float16x4_t *) &__reint1_218); \ + __ret_218 = __builtin_shufflevector(__ret_218, __ret_218, 3, 2, 1, 0); \ + __ret_218; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmlaq_rot270_lane_f16(__p0_219, __p1_219, __p2_219, __p3_219) __extension__ ({ \ + float16x8_t __s0_219 = __p0_219; \ + float16x8_t __s1_219 = __p1_219; \ + float16x4_t __s2_219 = __p2_219; \ + float16x8_t __ret_219; \ +float16x4_t __reint_219 = __s2_219; \ +uint32x4_t __reint1_219 = (uint32x4_t) {vget_lane_u32(*(uint32x2_t *) &__reint_219, __p3_219), vget_lane_u32(*(uint32x2_t *) &__reint_219, __p3_219), vget_lane_u32(*(uint32x2_t *) &__reint_219, __p3_219), vget_lane_u32(*(uint32x2_t *) &__reint_219, __p3_219)}; \ + __ret_219 = vcmlaq_rot270_f16(__s0_219, __s1_219, *(float16x8_t *) &__reint1_219); \ + __ret_219; \ +}) +#else +#define vcmlaq_rot270_lane_f16(__p0_220, __p1_220, __p2_220, __p3_220) __extension__ ({ \ + float16x8_t __s0_220 = __p0_220; \ + float16x8_t __s1_220 = __p1_220; \ + float16x4_t __s2_220 = __p2_220; \ + float16x8_t __rev0_220; __rev0_220 = __builtin_shufflevector(__s0_220, __s0_220, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1_220; __rev1_220 = __builtin_shufflevector(__s1_220, __s1_220, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __rev2_220; __rev2_220 = __builtin_shufflevector(__s2_220, __s2_220, 3, 2, 1, 0); \ + float16x8_t __ret_220; \ +float16x4_t __reint_220 = __rev2_220; \ +uint32x4_t __reint1_220 = (uint32x4_t) {__noswap_vget_lane_u32(*(uint32x2_t *) &__reint_220, __p3_220), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_220, __p3_220), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_220, __p3_220), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_220, __p3_220)}; \ + __ret_220 = __noswap_vcmlaq_rot270_f16(__rev0_220, __rev1_220, *(float16x8_t *) &__reint1_220); \ + __ret_220 = __builtin_shufflevector(__ret_220, __ret_220, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_220; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmla_rot270_laneq_f16(__p0_221, __p1_221, __p2_221, __p3_221) __extension__ ({ \ + float16x4_t __s0_221 = __p0_221; \ + float16x4_t __s1_221 = __p1_221; \ + float16x8_t __s2_221 = __p2_221; \ + float16x4_t __ret_221; \ +float16x8_t __reint_221 = __s2_221; \ +uint32x2_t __reint1_221 = (uint32x2_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_221, __p3_221), vgetq_lane_u32(*(uint32x4_t *) &__reint_221, __p3_221)}; \ + __ret_221 = vcmla_rot270_f16(__s0_221, __s1_221, *(float16x4_t *) &__reint1_221); \ + __ret_221; \ +}) +#else +#define vcmla_rot270_laneq_f16(__p0_222, __p1_222, __p2_222, __p3_222) __extension__ ({ \ + float16x4_t __s0_222 = __p0_222; \ + float16x4_t __s1_222 = __p1_222; \ + float16x8_t __s2_222 = __p2_222; \ + float16x4_t __rev0_222; __rev0_222 = __builtin_shufflevector(__s0_222, __s0_222, 3, 2, 1, 0); \ + float16x4_t __rev1_222; __rev1_222 = __builtin_shufflevector(__s1_222, __s1_222, 3, 2, 1, 0); \ + float16x8_t __rev2_222; __rev2_222 = __builtin_shufflevector(__s2_222, __s2_222, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __ret_222; \ +float16x8_t __reint_222 = __rev2_222; \ +uint32x2_t __reint1_222 = (uint32x2_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_222, __p3_222), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_222, __p3_222)}; \ + __ret_222 = __noswap_vcmla_rot270_f16(__rev0_222, __rev1_222, *(float16x4_t *) &__reint1_222); \ + __ret_222 = __builtin_shufflevector(__ret_222, __ret_222, 3, 2, 1, 0); \ + __ret_222; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmlaq_rot270_laneq_f16(__p0_223, 
__p1_223, __p2_223, __p3_223) __extension__ ({ \ + float16x8_t __s0_223 = __p0_223; \ + float16x8_t __s1_223 = __p1_223; \ + float16x8_t __s2_223 = __p2_223; \ + float16x8_t __ret_223; \ +float16x8_t __reint_223 = __s2_223; \ +uint32x4_t __reint1_223 = (uint32x4_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_223, __p3_223), vgetq_lane_u32(*(uint32x4_t *) &__reint_223, __p3_223), vgetq_lane_u32(*(uint32x4_t *) &__reint_223, __p3_223), vgetq_lane_u32(*(uint32x4_t *) &__reint_223, __p3_223)}; \ + __ret_223 = vcmlaq_rot270_f16(__s0_223, __s1_223, *(float16x8_t *) &__reint1_223); \ + __ret_223; \ +}) +#else +#define vcmlaq_rot270_laneq_f16(__p0_224, __p1_224, __p2_224, __p3_224) __extension__ ({ \ + float16x8_t __s0_224 = __p0_224; \ + float16x8_t __s1_224 = __p1_224; \ + float16x8_t __s2_224 = __p2_224; \ + float16x8_t __rev0_224; __rev0_224 = __builtin_shufflevector(__s0_224, __s0_224, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1_224; __rev1_224 = __builtin_shufflevector(__s1_224, __s1_224, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev2_224; __rev2_224 = __builtin_shufflevector(__s2_224, __s2_224, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret_224; \ +float16x8_t __reint_224 = __rev2_224; \ +uint32x4_t __reint1_224 = (uint32x4_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_224, __p3_224), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_224, __p3_224), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_224, __p3_224), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_224, __p3_224)}; \ + __ret_224 = __noswap_vcmlaq_rot270_f16(__rev0_224, __rev1_224, *(float16x8_t *) &__reint1_224); \ + __ret_224 = __builtin_shufflevector(__ret_224, __ret_224, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_224; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vcmlaq_rot90_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcmlaq_rot90_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40); + return __ret; +} +#else +__ai float16x8_t vcmlaq_rot90_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcmlaq_rot90_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +__ai float16x8_t __noswap_vcmlaq_rot90_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcmlaq_rot90_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vcmla_rot90_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcmla_rot90_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8); + return __ret; +} +#else +__ai float16x4_t vcmla_rot90_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcmla_rot90_v((int8x8_t)__rev0, 
(int8x8_t)__rev1, (int8x8_t)__rev2, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +__ai float16x4_t __noswap_vcmla_rot90_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcmla_rot90_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmla_rot90_lane_f16(__p0_225, __p1_225, __p2_225, __p3_225) __extension__ ({ \ + float16x4_t __s0_225 = __p0_225; \ + float16x4_t __s1_225 = __p1_225; \ + float16x4_t __s2_225 = __p2_225; \ + float16x4_t __ret_225; \ +float16x4_t __reint_225 = __s2_225; \ +uint32x2_t __reint1_225 = (uint32x2_t) {vget_lane_u32(*(uint32x2_t *) &__reint_225, __p3_225), vget_lane_u32(*(uint32x2_t *) &__reint_225, __p3_225)}; \ + __ret_225 = vcmla_rot90_f16(__s0_225, __s1_225, *(float16x4_t *) &__reint1_225); \ + __ret_225; \ +}) +#else +#define vcmla_rot90_lane_f16(__p0_226, __p1_226, __p2_226, __p3_226) __extension__ ({ \ + float16x4_t __s0_226 = __p0_226; \ + float16x4_t __s1_226 = __p1_226; \ + float16x4_t __s2_226 = __p2_226; \ + float16x4_t __rev0_226; __rev0_226 = __builtin_shufflevector(__s0_226, __s0_226, 3, 2, 1, 0); \ + float16x4_t __rev1_226; __rev1_226 = __builtin_shufflevector(__s1_226, __s1_226, 3, 2, 1, 0); \ + float16x4_t __rev2_226; __rev2_226 = __builtin_shufflevector(__s2_226, __s2_226, 3, 2, 1, 0); \ + float16x4_t __ret_226; \ +float16x4_t __reint_226 = __rev2_226; \ +uint32x2_t __reint1_226 = (uint32x2_t) {__noswap_vget_lane_u32(*(uint32x2_t *) &__reint_226, __p3_226), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_226, __p3_226)}; \ + __ret_226 = __noswap_vcmla_rot90_f16(__rev0_226, __rev1_226, *(float16x4_t *) &__reint1_226); \ + __ret_226 = __builtin_shufflevector(__ret_226, __ret_226, 3, 2, 1, 0); \ + __ret_226; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmlaq_rot90_lane_f16(__p0_227, __p1_227, __p2_227, __p3_227) __extension__ ({ \ + float16x8_t __s0_227 = __p0_227; \ + float16x8_t __s1_227 = __p1_227; \ + float16x4_t __s2_227 = __p2_227; \ + float16x8_t __ret_227; \ +float16x4_t __reint_227 = __s2_227; \ +uint32x4_t __reint1_227 = (uint32x4_t) {vget_lane_u32(*(uint32x2_t *) &__reint_227, __p3_227), vget_lane_u32(*(uint32x2_t *) &__reint_227, __p3_227), vget_lane_u32(*(uint32x2_t *) &__reint_227, __p3_227), vget_lane_u32(*(uint32x2_t *) &__reint_227, __p3_227)}; \ + __ret_227 = vcmlaq_rot90_f16(__s0_227, __s1_227, *(float16x8_t *) &__reint1_227); \ + __ret_227; \ +}) +#else +#define vcmlaq_rot90_lane_f16(__p0_228, __p1_228, __p2_228, __p3_228) __extension__ ({ \ + float16x8_t __s0_228 = __p0_228; \ + float16x8_t __s1_228 = __p1_228; \ + float16x4_t __s2_228 = __p2_228; \ + float16x8_t __rev0_228; __rev0_228 = __builtin_shufflevector(__s0_228, __s0_228, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1_228; __rev1_228 = __builtin_shufflevector(__s1_228, __s1_228, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __rev2_228; __rev2_228 = __builtin_shufflevector(__s2_228, __s2_228, 3, 2, 1, 0); \ + float16x8_t __ret_228; \ +float16x4_t __reint_228 = __rev2_228; \ +uint32x4_t __reint1_228 = (uint32x4_t) {__noswap_vget_lane_u32(*(uint32x2_t *) &__reint_228, __p3_228), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_228, __p3_228), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_228, __p3_228), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_228, __p3_228)}; \ + __ret_228 = __noswap_vcmlaq_rot90_f16(__rev0_228, __rev1_228, *(float16x8_t *) &__reint1_228); \ + __ret_228 = 
__builtin_shufflevector(__ret_228, __ret_228, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_228; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmla_rot90_laneq_f16(__p0_229, __p1_229, __p2_229, __p3_229) __extension__ ({ \ + float16x4_t __s0_229 = __p0_229; \ + float16x4_t __s1_229 = __p1_229; \ + float16x8_t __s2_229 = __p2_229; \ + float16x4_t __ret_229; \ +float16x8_t __reint_229 = __s2_229; \ +uint32x2_t __reint1_229 = (uint32x2_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_229, __p3_229), vgetq_lane_u32(*(uint32x4_t *) &__reint_229, __p3_229)}; \ + __ret_229 = vcmla_rot90_f16(__s0_229, __s1_229, *(float16x4_t *) &__reint1_229); \ + __ret_229; \ +}) +#else +#define vcmla_rot90_laneq_f16(__p0_230, __p1_230, __p2_230, __p3_230) __extension__ ({ \ + float16x4_t __s0_230 = __p0_230; \ + float16x4_t __s1_230 = __p1_230; \ + float16x8_t __s2_230 = __p2_230; \ + float16x4_t __rev0_230; __rev0_230 = __builtin_shufflevector(__s0_230, __s0_230, 3, 2, 1, 0); \ + float16x4_t __rev1_230; __rev1_230 = __builtin_shufflevector(__s1_230, __s1_230, 3, 2, 1, 0); \ + float16x8_t __rev2_230; __rev2_230 = __builtin_shufflevector(__s2_230, __s2_230, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __ret_230; \ +float16x8_t __reint_230 = __rev2_230; \ +uint32x2_t __reint1_230 = (uint32x2_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_230, __p3_230), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_230, __p3_230)}; \ + __ret_230 = __noswap_vcmla_rot90_f16(__rev0_230, __rev1_230, *(float16x4_t *) &__reint1_230); \ + __ret_230 = __builtin_shufflevector(__ret_230, __ret_230, 3, 2, 1, 0); \ + __ret_230; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmlaq_rot90_laneq_f16(__p0_231, __p1_231, __p2_231, __p3_231) __extension__ ({ \ + float16x8_t __s0_231 = __p0_231; \ + float16x8_t __s1_231 = __p1_231; \ + float16x8_t __s2_231 = __p2_231; \ + float16x8_t __ret_231; \ +float16x8_t __reint_231 = __s2_231; \ +uint32x4_t __reint1_231 = (uint32x4_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_231, __p3_231), vgetq_lane_u32(*(uint32x4_t *) &__reint_231, __p3_231), vgetq_lane_u32(*(uint32x4_t *) &__reint_231, __p3_231), vgetq_lane_u32(*(uint32x4_t *) &__reint_231, __p3_231)}; \ + __ret_231 = vcmlaq_rot90_f16(__s0_231, __s1_231, *(float16x8_t *) &__reint1_231); \ + __ret_231; \ +}) +#else +#define vcmlaq_rot90_laneq_f16(__p0_232, __p1_232, __p2_232, __p3_232) __extension__ ({ \ + float16x8_t __s0_232 = __p0_232; \ + float16x8_t __s1_232 = __p1_232; \ + float16x8_t __s2_232 = __p2_232; \ + float16x8_t __rev0_232; __rev0_232 = __builtin_shufflevector(__s0_232, __s0_232, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1_232; __rev1_232 = __builtin_shufflevector(__s1_232, __s1_232, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev2_232; __rev2_232 = __builtin_shufflevector(__s2_232, __s2_232, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret_232; \ +float16x8_t __reint_232 = __rev2_232; \ +uint32x4_t __reint1_232 = (uint32x4_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_232, __p3_232), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_232, __p3_232), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_232, __p3_232), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_232, __p3_232)}; \ + __ret_232 = __noswap_vcmlaq_rot90_f16(__rev0_232, __rev1_232, *(float16x8_t *) &__reint1_232); \ + __ret_232 = __builtin_shufflevector(__ret_232, __ret_232, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_232; \ +}) +#endif + #endif #if defined(__ARM_FEATURE_COMPLEX) && defined(__aarch64__) #ifdef __LITTLE_ENDIAN__ @@ -40535,6 +41799,478 @@ __ai float64x2_t 
vcaddq_rot90_f64(float64x2_t __p0, float64x2_t __p1) { } #endif +#ifdef __LITTLE_ENDIAN__ +__ai float64x2_t vcmlaq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) { + float64x2_t __ret; + __ret = (float64x2_t) __builtin_neon_vcmlaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42); + return __ret; +} +#else +__ai float64x2_t vcmlaq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) { + float64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); + float64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + float64x2_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0); + float64x2_t __ret; + __ret = (float64x2_t) __builtin_neon_vcmlaq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 42); + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); + return __ret; +} +__ai float64x2_t __noswap_vcmlaq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) { + float64x2_t __ret; + __ret = (float64x2_t) __builtin_neon_vcmlaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42); + return __ret; +} +#endif + +__ai float64x1_t vcmla_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) { + float64x1_t __ret; + __ret = (float64x1_t) __builtin_neon_vcmla_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10); + return __ret; +} +#define vcmla_lane_f64(__p0_233, __p1_233, __p2_233, __p3_233) __extension__ ({ \ + float64x1_t __s0_233 = __p0_233; \ + float64x1_t __s1_233 = __p1_233; \ + float64x1_t __s2_233 = __p2_233; \ + float64x1_t __ret_233; \ +float64x1_t __reint_233 = __s2_233; \ +uint64x2_t __reint1_233 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_233, __p3_233), vgetq_lane_u64(*(uint64x2_t *) &__reint_233, __p3_233)}; \ + __ret_233 = vcmla_f64(__s0_233, __s1_233, *(float64x1_t *) &__reint1_233); \ + __ret_233; \ +}) +#ifdef __LITTLE_ENDIAN__ +#define vcmlaq_lane_f64(__p0_234, __p1_234, __p2_234, __p3_234) __extension__ ({ \ + float64x2_t __s0_234 = __p0_234; \ + float64x2_t __s1_234 = __p1_234; \ + float64x1_t __s2_234 = __p2_234; \ + float64x2_t __ret_234; \ +float64x1_t __reint_234 = __s2_234; \ +uint64x2_t __reint1_234 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_234, __p3_234), vgetq_lane_u64(*(uint64x2_t *) &__reint_234, __p3_234)}; \ + __ret_234 = vcmlaq_f64(__s0_234, __s1_234, *(float64x2_t *) &__reint1_234); \ + __ret_234; \ +}) +#else +#define vcmlaq_lane_f64(__p0_235, __p1_235, __p2_235, __p3_235) __extension__ ({ \ + float64x2_t __s0_235 = __p0_235; \ + float64x2_t __s1_235 = __p1_235; \ + float64x1_t __s2_235 = __p2_235; \ + float64x2_t __rev0_235; __rev0_235 = __builtin_shufflevector(__s0_235, __s0_235, 1, 0); \ + float64x2_t __rev1_235; __rev1_235 = __builtin_shufflevector(__s1_235, __s1_235, 1, 0); \ + float64x2_t __ret_235; \ +float64x1_t __reint_235 = __s2_235; \ +uint64x2_t __reint1_235 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_235, __p3_235), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_235, __p3_235)}; \ + __ret_235 = __noswap_vcmlaq_f64(__rev0_235, __rev1_235, *(float64x2_t *) &__reint1_235); \ + __ret_235 = __builtin_shufflevector(__ret_235, __ret_235, 1, 0); \ + __ret_235; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmla_laneq_f64(__p0_236, __p1_236, __p2_236, __p3_236) __extension__ ({ \ + float64x1_t __s0_236 = __p0_236; \ + float64x1_t __s1_236 = __p1_236; \ + float64x2_t __s2_236 = __p2_236; \ + float64x1_t __ret_236; \ +float64x2_t __reint_236 = __s2_236; \ +uint64x2_t __reint1_236 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) 
&__reint_236, __p3_236), vgetq_lane_u64(*(uint64x2_t *) &__reint_236, __p3_236)}; \ + __ret_236 = vcmla_f64(__s0_236, __s1_236, *(float64x1_t *) &__reint1_236); \ + __ret_236; \ +}) +#else +#define vcmla_laneq_f64(__p0_237, __p1_237, __p2_237, __p3_237) __extension__ ({ \ + float64x1_t __s0_237 = __p0_237; \ + float64x1_t __s1_237 = __p1_237; \ + float64x2_t __s2_237 = __p2_237; \ + float64x2_t __rev2_237; __rev2_237 = __builtin_shufflevector(__s2_237, __s2_237, 1, 0); \ + float64x1_t __ret_237; \ +float64x2_t __reint_237 = __rev2_237; \ +uint64x2_t __reint1_237 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_237, __p3_237), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_237, __p3_237)}; \ + __ret_237 = vcmla_f64(__s0_237, __s1_237, *(float64x1_t *) &__reint1_237); \ + __ret_237; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmlaq_laneq_f64(__p0_238, __p1_238, __p2_238, __p3_238) __extension__ ({ \ + float64x2_t __s0_238 = __p0_238; \ + float64x2_t __s1_238 = __p1_238; \ + float64x2_t __s2_238 = __p2_238; \ + float64x2_t __ret_238; \ +float64x2_t __reint_238 = __s2_238; \ +uint64x2_t __reint1_238 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_238, __p3_238), vgetq_lane_u64(*(uint64x2_t *) &__reint_238, __p3_238)}; \ + __ret_238 = vcmlaq_f64(__s0_238, __s1_238, *(float64x2_t *) &__reint1_238); \ + __ret_238; \ +}) +#else +#define vcmlaq_laneq_f64(__p0_239, __p1_239, __p2_239, __p3_239) __extension__ ({ \ + float64x2_t __s0_239 = __p0_239; \ + float64x2_t __s1_239 = __p1_239; \ + float64x2_t __s2_239 = __p2_239; \ + float64x2_t __rev0_239; __rev0_239 = __builtin_shufflevector(__s0_239, __s0_239, 1, 0); \ + float64x2_t __rev1_239; __rev1_239 = __builtin_shufflevector(__s1_239, __s1_239, 1, 0); \ + float64x2_t __rev2_239; __rev2_239 = __builtin_shufflevector(__s2_239, __s2_239, 1, 0); \ + float64x2_t __ret_239; \ +float64x2_t __reint_239 = __rev2_239; \ +uint64x2_t __reint1_239 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_239, __p3_239), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_239, __p3_239)}; \ + __ret_239 = __noswap_vcmlaq_f64(__rev0_239, __rev1_239, *(float64x2_t *) &__reint1_239); \ + __ret_239 = __builtin_shufflevector(__ret_239, __ret_239, 1, 0); \ + __ret_239; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float64x2_t vcmlaq_rot180_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) { + float64x2_t __ret; + __ret = (float64x2_t) __builtin_neon_vcmlaq_rot180_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42); + return __ret; +} +#else +__ai float64x2_t vcmlaq_rot180_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) { + float64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); + float64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + float64x2_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0); + float64x2_t __ret; + __ret = (float64x2_t) __builtin_neon_vcmlaq_rot180_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 42); + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); + return __ret; +} +__ai float64x2_t __noswap_vcmlaq_rot180_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) { + float64x2_t __ret; + __ret = (float64x2_t) __builtin_neon_vcmlaq_rot180_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42); + return __ret; +} +#endif + +__ai float64x1_t vcmla_rot180_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) { + float64x1_t __ret; + __ret = (float64x1_t) __builtin_neon_vcmla_rot180_v((int8x8_t)__p0, (int8x8_t)__p1, 
(int8x8_t)__p2, 10); + return __ret; +} +#define vcmla_rot180_lane_f64(__p0_240, __p1_240, __p2_240, __p3_240) __extension__ ({ \ + float64x1_t __s0_240 = __p0_240; \ + float64x1_t __s1_240 = __p1_240; \ + float64x1_t __s2_240 = __p2_240; \ + float64x1_t __ret_240; \ +float64x1_t __reint_240 = __s2_240; \ +uint64x2_t __reint1_240 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_240, __p3_240), vgetq_lane_u64(*(uint64x2_t *) &__reint_240, __p3_240)}; \ + __ret_240 = vcmla_rot180_f64(__s0_240, __s1_240, *(float64x1_t *) &__reint1_240); \ + __ret_240; \ +}) +#ifdef __LITTLE_ENDIAN__ +#define vcmlaq_rot180_lane_f64(__p0_241, __p1_241, __p2_241, __p3_241) __extension__ ({ \ + float64x2_t __s0_241 = __p0_241; \ + float64x2_t __s1_241 = __p1_241; \ + float64x1_t __s2_241 = __p2_241; \ + float64x2_t __ret_241; \ +float64x1_t __reint_241 = __s2_241; \ +uint64x2_t __reint1_241 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_241, __p3_241), vgetq_lane_u64(*(uint64x2_t *) &__reint_241, __p3_241)}; \ + __ret_241 = vcmlaq_rot180_f64(__s0_241, __s1_241, *(float64x2_t *) &__reint1_241); \ + __ret_241; \ +}) +#else +#define vcmlaq_rot180_lane_f64(__p0_242, __p1_242, __p2_242, __p3_242) __extension__ ({ \ + float64x2_t __s0_242 = __p0_242; \ + float64x2_t __s1_242 = __p1_242; \ + float64x1_t __s2_242 = __p2_242; \ + float64x2_t __rev0_242; __rev0_242 = __builtin_shufflevector(__s0_242, __s0_242, 1, 0); \ + float64x2_t __rev1_242; __rev1_242 = __builtin_shufflevector(__s1_242, __s1_242, 1, 0); \ + float64x2_t __ret_242; \ +float64x1_t __reint_242 = __s2_242; \ +uint64x2_t __reint1_242 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_242, __p3_242), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_242, __p3_242)}; \ + __ret_242 = __noswap_vcmlaq_rot180_f64(__rev0_242, __rev1_242, *(float64x2_t *) &__reint1_242); \ + __ret_242 = __builtin_shufflevector(__ret_242, __ret_242, 1, 0); \ + __ret_242; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmla_rot180_laneq_f64(__p0_243, __p1_243, __p2_243, __p3_243) __extension__ ({ \ + float64x1_t __s0_243 = __p0_243; \ + float64x1_t __s1_243 = __p1_243; \ + float64x2_t __s2_243 = __p2_243; \ + float64x1_t __ret_243; \ +float64x2_t __reint_243 = __s2_243; \ +uint64x2_t __reint1_243 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_243, __p3_243), vgetq_lane_u64(*(uint64x2_t *) &__reint_243, __p3_243)}; \ + __ret_243 = vcmla_rot180_f64(__s0_243, __s1_243, *(float64x1_t *) &__reint1_243); \ + __ret_243; \ +}) +#else +#define vcmla_rot180_laneq_f64(__p0_244, __p1_244, __p2_244, __p3_244) __extension__ ({ \ + float64x1_t __s0_244 = __p0_244; \ + float64x1_t __s1_244 = __p1_244; \ + float64x2_t __s2_244 = __p2_244; \ + float64x2_t __rev2_244; __rev2_244 = __builtin_shufflevector(__s2_244, __s2_244, 1, 0); \ + float64x1_t __ret_244; \ +float64x2_t __reint_244 = __rev2_244; \ +uint64x2_t __reint1_244 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_244, __p3_244), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_244, __p3_244)}; \ + __ret_244 = vcmla_rot180_f64(__s0_244, __s1_244, *(float64x1_t *) &__reint1_244); \ + __ret_244; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmlaq_rot180_laneq_f64(__p0_245, __p1_245, __p2_245, __p3_245) __extension__ ({ \ + float64x2_t __s0_245 = __p0_245; \ + float64x2_t __s1_245 = __p1_245; \ + float64x2_t __s2_245 = __p2_245; \ + float64x2_t __ret_245; \ +float64x2_t __reint_245 = __s2_245; \ +uint64x2_t __reint1_245 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_245, 
__p3_245), vgetq_lane_u64(*(uint64x2_t *) &__reint_245, __p3_245)}; \ + __ret_245 = vcmlaq_rot180_f64(__s0_245, __s1_245, *(float64x2_t *) &__reint1_245); \ + __ret_245; \ +}) +#else +#define vcmlaq_rot180_laneq_f64(__p0_246, __p1_246, __p2_246, __p3_246) __extension__ ({ \ + float64x2_t __s0_246 = __p0_246; \ + float64x2_t __s1_246 = __p1_246; \ + float64x2_t __s2_246 = __p2_246; \ + float64x2_t __rev0_246; __rev0_246 = __builtin_shufflevector(__s0_246, __s0_246, 1, 0); \ + float64x2_t __rev1_246; __rev1_246 = __builtin_shufflevector(__s1_246, __s1_246, 1, 0); \ + float64x2_t __rev2_246; __rev2_246 = __builtin_shufflevector(__s2_246, __s2_246, 1, 0); \ + float64x2_t __ret_246; \ +float64x2_t __reint_246 = __rev2_246; \ +uint64x2_t __reint1_246 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_246, __p3_246), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_246, __p3_246)}; \ + __ret_246 = __noswap_vcmlaq_rot180_f64(__rev0_246, __rev1_246, *(float64x2_t *) &__reint1_246); \ + __ret_246 = __builtin_shufflevector(__ret_246, __ret_246, 1, 0); \ + __ret_246; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float64x2_t vcmlaq_rot270_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) { + float64x2_t __ret; + __ret = (float64x2_t) __builtin_neon_vcmlaq_rot270_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42); + return __ret; +} +#else +__ai float64x2_t vcmlaq_rot270_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) { + float64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); + float64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + float64x2_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0); + float64x2_t __ret; + __ret = (float64x2_t) __builtin_neon_vcmlaq_rot270_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 42); + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); + return __ret; +} +__ai float64x2_t __noswap_vcmlaq_rot270_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) { + float64x2_t __ret; + __ret = (float64x2_t) __builtin_neon_vcmlaq_rot270_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42); + return __ret; +} +#endif + +__ai float64x1_t vcmla_rot270_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) { + float64x1_t __ret; + __ret = (float64x1_t) __builtin_neon_vcmla_rot270_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10); + return __ret; +} +#define vcmla_rot270_lane_f64(__p0_247, __p1_247, __p2_247, __p3_247) __extension__ ({ \ + float64x1_t __s0_247 = __p0_247; \ + float64x1_t __s1_247 = __p1_247; \ + float64x1_t __s2_247 = __p2_247; \ + float64x1_t __ret_247; \ +float64x1_t __reint_247 = __s2_247; \ +uint64x2_t __reint1_247 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_247, __p3_247), vgetq_lane_u64(*(uint64x2_t *) &__reint_247, __p3_247)}; \ + __ret_247 = vcmla_rot270_f64(__s0_247, __s1_247, *(float64x1_t *) &__reint1_247); \ + __ret_247; \ +}) +#ifdef __LITTLE_ENDIAN__ +#define vcmlaq_rot270_lane_f64(__p0_248, __p1_248, __p2_248, __p3_248) __extension__ ({ \ + float64x2_t __s0_248 = __p0_248; \ + float64x2_t __s1_248 = __p1_248; \ + float64x1_t __s2_248 = __p2_248; \ + float64x2_t __ret_248; \ +float64x1_t __reint_248 = __s2_248; \ +uint64x2_t __reint1_248 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_248, __p3_248), vgetq_lane_u64(*(uint64x2_t *) &__reint_248, __p3_248)}; \ + __ret_248 = vcmlaq_rot270_f64(__s0_248, __s1_248, *(float64x2_t *) &__reint1_248); \ + __ret_248; \ +}) +#else +#define vcmlaq_rot270_lane_f64(__p0_249, __p1_249, 
__p2_249, __p3_249) __extension__ ({ \ + float64x2_t __s0_249 = __p0_249; \ + float64x2_t __s1_249 = __p1_249; \ + float64x1_t __s2_249 = __p2_249; \ + float64x2_t __rev0_249; __rev0_249 = __builtin_shufflevector(__s0_249, __s0_249, 1, 0); \ + float64x2_t __rev1_249; __rev1_249 = __builtin_shufflevector(__s1_249, __s1_249, 1, 0); \ + float64x2_t __ret_249; \ +float64x1_t __reint_249 = __s2_249; \ +uint64x2_t __reint1_249 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_249, __p3_249), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_249, __p3_249)}; \ + __ret_249 = __noswap_vcmlaq_rot270_f64(__rev0_249, __rev1_249, *(float64x2_t *) &__reint1_249); \ + __ret_249 = __builtin_shufflevector(__ret_249, __ret_249, 1, 0); \ + __ret_249; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmla_rot270_laneq_f64(__p0_250, __p1_250, __p2_250, __p3_250) __extension__ ({ \ + float64x1_t __s0_250 = __p0_250; \ + float64x1_t __s1_250 = __p1_250; \ + float64x2_t __s2_250 = __p2_250; \ + float64x1_t __ret_250; \ +float64x2_t __reint_250 = __s2_250; \ +uint64x2_t __reint1_250 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_250, __p3_250), vgetq_lane_u64(*(uint64x2_t *) &__reint_250, __p3_250)}; \ + __ret_250 = vcmla_rot270_f64(__s0_250, __s1_250, *(float64x1_t *) &__reint1_250); \ + __ret_250; \ +}) +#else +#define vcmla_rot270_laneq_f64(__p0_251, __p1_251, __p2_251, __p3_251) __extension__ ({ \ + float64x1_t __s0_251 = __p0_251; \ + float64x1_t __s1_251 = __p1_251; \ + float64x2_t __s2_251 = __p2_251; \ + float64x2_t __rev2_251; __rev2_251 = __builtin_shufflevector(__s2_251, __s2_251, 1, 0); \ + float64x1_t __ret_251; \ +float64x2_t __reint_251 = __rev2_251; \ +uint64x2_t __reint1_251 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_251, __p3_251), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_251, __p3_251)}; \ + __ret_251 = vcmla_rot270_f64(__s0_251, __s1_251, *(float64x1_t *) &__reint1_251); \ + __ret_251; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmlaq_rot270_laneq_f64(__p0_252, __p1_252, __p2_252, __p3_252) __extension__ ({ \ + float64x2_t __s0_252 = __p0_252; \ + float64x2_t __s1_252 = __p1_252; \ + float64x2_t __s2_252 = __p2_252; \ + float64x2_t __ret_252; \ +float64x2_t __reint_252 = __s2_252; \ +uint64x2_t __reint1_252 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_252, __p3_252), vgetq_lane_u64(*(uint64x2_t *) &__reint_252, __p3_252)}; \ + __ret_252 = vcmlaq_rot270_f64(__s0_252, __s1_252, *(float64x2_t *) &__reint1_252); \ + __ret_252; \ +}) +#else +#define vcmlaq_rot270_laneq_f64(__p0_253, __p1_253, __p2_253, __p3_253) __extension__ ({ \ + float64x2_t __s0_253 = __p0_253; \ + float64x2_t __s1_253 = __p1_253; \ + float64x2_t __s2_253 = __p2_253; \ + float64x2_t __rev0_253; __rev0_253 = __builtin_shufflevector(__s0_253, __s0_253, 1, 0); \ + float64x2_t __rev1_253; __rev1_253 = __builtin_shufflevector(__s1_253, __s1_253, 1, 0); \ + float64x2_t __rev2_253; __rev2_253 = __builtin_shufflevector(__s2_253, __s2_253, 1, 0); \ + float64x2_t __ret_253; \ +float64x2_t __reint_253 = __rev2_253; \ +uint64x2_t __reint1_253 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_253, __p3_253), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_253, __p3_253)}; \ + __ret_253 = __noswap_vcmlaq_rot270_f64(__rev0_253, __rev1_253, *(float64x2_t *) &__reint1_253); \ + __ret_253 = __builtin_shufflevector(__ret_253, __ret_253, 1, 0); \ + __ret_253; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float64x2_t vcmlaq_rot90_f64(float64x2_t __p0, 
float64x2_t __p1, float64x2_t __p2) { + float64x2_t __ret; + __ret = (float64x2_t) __builtin_neon_vcmlaq_rot90_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42); + return __ret; +} +#else +__ai float64x2_t vcmlaq_rot90_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) { + float64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); + float64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + float64x2_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0); + float64x2_t __ret; + __ret = (float64x2_t) __builtin_neon_vcmlaq_rot90_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 42); + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); + return __ret; +} +__ai float64x2_t __noswap_vcmlaq_rot90_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) { + float64x2_t __ret; + __ret = (float64x2_t) __builtin_neon_vcmlaq_rot90_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42); + return __ret; +} +#endif + +__ai float64x1_t vcmla_rot90_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) { + float64x1_t __ret; + __ret = (float64x1_t) __builtin_neon_vcmla_rot90_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10); + return __ret; +} +#define vcmla_rot90_lane_f64(__p0_254, __p1_254, __p2_254, __p3_254) __extension__ ({ \ + float64x1_t __s0_254 = __p0_254; \ + float64x1_t __s1_254 = __p1_254; \ + float64x1_t __s2_254 = __p2_254; \ + float64x1_t __ret_254; \ +float64x1_t __reint_254 = __s2_254; \ +uint64x2_t __reint1_254 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_254, __p3_254), vgetq_lane_u64(*(uint64x2_t *) &__reint_254, __p3_254)}; \ + __ret_254 = vcmla_rot90_f64(__s0_254, __s1_254, *(float64x1_t *) &__reint1_254); \ + __ret_254; \ +}) +#ifdef __LITTLE_ENDIAN__ +#define vcmlaq_rot90_lane_f64(__p0_255, __p1_255, __p2_255, __p3_255) __extension__ ({ \ + float64x2_t __s0_255 = __p0_255; \ + float64x2_t __s1_255 = __p1_255; \ + float64x1_t __s2_255 = __p2_255; \ + float64x2_t __ret_255; \ +float64x1_t __reint_255 = __s2_255; \ +uint64x2_t __reint1_255 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_255, __p3_255), vgetq_lane_u64(*(uint64x2_t *) &__reint_255, __p3_255)}; \ + __ret_255 = vcmlaq_rot90_f64(__s0_255, __s1_255, *(float64x2_t *) &__reint1_255); \ + __ret_255; \ +}) +#else +#define vcmlaq_rot90_lane_f64(__p0_256, __p1_256, __p2_256, __p3_256) __extension__ ({ \ + float64x2_t __s0_256 = __p0_256; \ + float64x2_t __s1_256 = __p1_256; \ + float64x1_t __s2_256 = __p2_256; \ + float64x2_t __rev0_256; __rev0_256 = __builtin_shufflevector(__s0_256, __s0_256, 1, 0); \ + float64x2_t __rev1_256; __rev1_256 = __builtin_shufflevector(__s1_256, __s1_256, 1, 0); \ + float64x2_t __ret_256; \ +float64x1_t __reint_256 = __s2_256; \ +uint64x2_t __reint1_256 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_256, __p3_256), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_256, __p3_256)}; \ + __ret_256 = __noswap_vcmlaq_rot90_f64(__rev0_256, __rev1_256, *(float64x2_t *) &__reint1_256); \ + __ret_256 = __builtin_shufflevector(__ret_256, __ret_256, 1, 0); \ + __ret_256; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmla_rot90_laneq_f64(__p0_257, __p1_257, __p2_257, __p3_257) __extension__ ({ \ + float64x1_t __s0_257 = __p0_257; \ + float64x1_t __s1_257 = __p1_257; \ + float64x2_t __s2_257 = __p2_257; \ + float64x1_t __ret_257; \ +float64x2_t __reint_257 = __s2_257; \ +uint64x2_t __reint1_257 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_257, __p3_257), vgetq_lane_u64(*(uint64x2_t *) 
&__reint_257, __p3_257)}; \ + __ret_257 = vcmla_rot90_f64(__s0_257, __s1_257, *(float64x1_t *) &__reint1_257); \ + __ret_257; \ +}) +#else +#define vcmla_rot90_laneq_f64(__p0_258, __p1_258, __p2_258, __p3_258) __extension__ ({ \ + float64x1_t __s0_258 = __p0_258; \ + float64x1_t __s1_258 = __p1_258; \ + float64x2_t __s2_258 = __p2_258; \ + float64x2_t __rev2_258; __rev2_258 = __builtin_shufflevector(__s2_258, __s2_258, 1, 0); \ + float64x1_t __ret_258; \ +float64x2_t __reint_258 = __rev2_258; \ +uint64x2_t __reint1_258 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_258, __p3_258), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_258, __p3_258)}; \ + __ret_258 = vcmla_rot90_f64(__s0_258, __s1_258, *(float64x1_t *) &__reint1_258); \ + __ret_258; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcmlaq_rot90_laneq_f64(__p0_259, __p1_259, __p2_259, __p3_259) __extension__ ({ \ + float64x2_t __s0_259 = __p0_259; \ + float64x2_t __s1_259 = __p1_259; \ + float64x2_t __s2_259 = __p2_259; \ + float64x2_t __ret_259; \ +float64x2_t __reint_259 = __s2_259; \ +uint64x2_t __reint1_259 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_259, __p3_259), vgetq_lane_u64(*(uint64x2_t *) &__reint_259, __p3_259)}; \ + __ret_259 = vcmlaq_rot90_f64(__s0_259, __s1_259, *(float64x2_t *) &__reint1_259); \ + __ret_259; \ +}) +#else +#define vcmlaq_rot90_laneq_f64(__p0_260, __p1_260, __p2_260, __p3_260) __extension__ ({ \ + float64x2_t __s0_260 = __p0_260; \ + float64x2_t __s1_260 = __p1_260; \ + float64x2_t __s2_260 = __p2_260; \ + float64x2_t __rev0_260; __rev0_260 = __builtin_shufflevector(__s0_260, __s0_260, 1, 0); \ + float64x2_t __rev1_260; __rev1_260 = __builtin_shufflevector(__s1_260, __s1_260, 1, 0); \ + float64x2_t __rev2_260; __rev2_260 = __builtin_shufflevector(__s2_260, __s2_260, 1, 0); \ + float64x2_t __ret_260; \ +float64x2_t __reint_260 = __rev2_260; \ +uint64x2_t __reint1_260 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_260, __p3_260), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_260, __p3_260)}; \ + __ret_260 = __noswap_vcmlaq_rot90_f64(__rev0_260, __rev1_260, *(float64x2_t *) &__reint1_260); \ + __ret_260 = __builtin_shufflevector(__ret_260, __ret_260, 1, 0); \ + __ret_260; \ +}) +#endif + #endif #if defined(__ARM_FEATURE_DOTPROD) #ifdef __LITTLE_ENDIAN__ @@ -40630,228 +42366,228 @@ __ai int32x2_t __noswap_vdot_s32(int32x2_t __p0, int8x8_t __p1, int8x8_t __p2) { #endif #ifdef __LITTLE_ENDIAN__ -#define vdotq_lane_u32(__p0_169, __p1_169, __p2_169, __p3_169) __extension__ ({ \ - uint32x4_t __s0_169 = __p0_169; \ - uint8x16_t __s1_169 = __p1_169; \ - uint8x8_t __s2_169 = __p2_169; \ - uint32x4_t __ret_169; \ -uint8x8_t __reint_169 = __s2_169; \ -uint32x4_t __reint1_169 = splatq_lane_u32(*(uint32x2_t *) &__reint_169, __p3_169); \ - __ret_169 = vdotq_u32(__s0_169, __s1_169, *(uint8x16_t *) &__reint1_169); \ - __ret_169; \ +#define vdotq_lane_u32(__p0_261, __p1_261, __p2_261, __p3_261) __extension__ ({ \ + uint32x4_t __s0_261 = __p0_261; \ + uint8x16_t __s1_261 = __p1_261; \ + uint8x8_t __s2_261 = __p2_261; \ + uint32x4_t __ret_261; \ +uint8x8_t __reint_261 = __s2_261; \ +uint32x4_t __reint1_261 = splatq_lane_u32(*(uint32x2_t *) &__reint_261, __p3_261); \ + __ret_261 = vdotq_u32(__s0_261, __s1_261, *(uint8x16_t *) &__reint1_261); \ + __ret_261; \ }) #else -#define vdotq_lane_u32(__p0_170, __p1_170, __p2_170, __p3_170) __extension__ ({ \ - uint32x4_t __s0_170 = __p0_170; \ - uint8x16_t __s1_170 = __p1_170; \ - uint8x8_t __s2_170 = __p2_170; \ - uint32x4_t 
__rev0_170; __rev0_170 = __builtin_shufflevector(__s0_170, __s0_170, 3, 2, 1, 0); \ - uint8x16_t __rev1_170; __rev1_170 = __builtin_shufflevector(__s1_170, __s1_170, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x8_t __rev2_170; __rev2_170 = __builtin_shufflevector(__s2_170, __s2_170, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint32x4_t __ret_170; \ -uint8x8_t __reint_170 = __rev2_170; \ -uint32x4_t __reint1_170 = __noswap_splatq_lane_u32(*(uint32x2_t *) &__reint_170, __p3_170); \ - __ret_170 = __noswap_vdotq_u32(__rev0_170, __rev1_170, *(uint8x16_t *) &__reint1_170); \ - __ret_170 = __builtin_shufflevector(__ret_170, __ret_170, 3, 2, 1, 0); \ - __ret_170; \ +#define vdotq_lane_u32(__p0_262, __p1_262, __p2_262, __p3_262) __extension__ ({ \ + uint32x4_t __s0_262 = __p0_262; \ + uint8x16_t __s1_262 = __p1_262; \ + uint8x8_t __s2_262 = __p2_262; \ + uint32x4_t __rev0_262; __rev0_262 = __builtin_shufflevector(__s0_262, __s0_262, 3, 2, 1, 0); \ + uint8x16_t __rev1_262; __rev1_262 = __builtin_shufflevector(__s1_262, __s1_262, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev2_262; __rev2_262 = __builtin_shufflevector(__s2_262, __s2_262, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint32x4_t __ret_262; \ +uint8x8_t __reint_262 = __rev2_262; \ +uint32x4_t __reint1_262 = __noswap_splatq_lane_u32(*(uint32x2_t *) &__reint_262, __p3_262); \ + __ret_262 = __noswap_vdotq_u32(__rev0_262, __rev1_262, *(uint8x16_t *) &__reint1_262); \ + __ret_262 = __builtin_shufflevector(__ret_262, __ret_262, 3, 2, 1, 0); \ + __ret_262; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdotq_lane_s32(__p0_171, __p1_171, __p2_171, __p3_171) __extension__ ({ \ - int32x4_t __s0_171 = __p0_171; \ - int8x16_t __s1_171 = __p1_171; \ - int8x8_t __s2_171 = __p2_171; \ - int32x4_t __ret_171; \ -int8x8_t __reint_171 = __s2_171; \ -int32x4_t __reint1_171 = splatq_lane_s32(*(int32x2_t *) &__reint_171, __p3_171); \ - __ret_171 = vdotq_s32(__s0_171, __s1_171, *(int8x16_t *) &__reint1_171); \ - __ret_171; \ +#define vdotq_lane_s32(__p0_263, __p1_263, __p2_263, __p3_263) __extension__ ({ \ + int32x4_t __s0_263 = __p0_263; \ + int8x16_t __s1_263 = __p1_263; \ + int8x8_t __s2_263 = __p2_263; \ + int32x4_t __ret_263; \ +int8x8_t __reint_263 = __s2_263; \ +int32x4_t __reint1_263 = splatq_lane_s32(*(int32x2_t *) &__reint_263, __p3_263); \ + __ret_263 = vdotq_s32(__s0_263, __s1_263, *(int8x16_t *) &__reint1_263); \ + __ret_263; \ }) #else -#define vdotq_lane_s32(__p0_172, __p1_172, __p2_172, __p3_172) __extension__ ({ \ - int32x4_t __s0_172 = __p0_172; \ - int8x16_t __s1_172 = __p1_172; \ - int8x8_t __s2_172 = __p2_172; \ - int32x4_t __rev0_172; __rev0_172 = __builtin_shufflevector(__s0_172, __s0_172, 3, 2, 1, 0); \ - int8x16_t __rev1_172; __rev1_172 = __builtin_shufflevector(__s1_172, __s1_172, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x8_t __rev2_172; __rev2_172 = __builtin_shufflevector(__s2_172, __s2_172, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32x4_t __ret_172; \ -int8x8_t __reint_172 = __rev2_172; \ -int32x4_t __reint1_172 = __noswap_splatq_lane_s32(*(int32x2_t *) &__reint_172, __p3_172); \ - __ret_172 = __noswap_vdotq_s32(__rev0_172, __rev1_172, *(int8x16_t *) &__reint1_172); \ - __ret_172 = __builtin_shufflevector(__ret_172, __ret_172, 3, 2, 1, 0); \ - __ret_172; \ +#define vdotq_lane_s32(__p0_264, __p1_264, __p2_264, __p3_264) __extension__ ({ \ + int32x4_t __s0_264 = __p0_264; \ + int8x16_t __s1_264 = __p1_264; \ + int8x8_t __s2_264 = __p2_264; \ + int32x4_t __rev0_264; __rev0_264 = 
__builtin_shufflevector(__s0_264, __s0_264, 3, 2, 1, 0); \ + int8x16_t __rev1_264; __rev1_264 = __builtin_shufflevector(__s1_264, __s1_264, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x8_t __rev2_264; __rev2_264 = __builtin_shufflevector(__s2_264, __s2_264, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x4_t __ret_264; \ +int8x8_t __reint_264 = __rev2_264; \ +int32x4_t __reint1_264 = __noswap_splatq_lane_s32(*(int32x2_t *) &__reint_264, __p3_264); \ + __ret_264 = __noswap_vdotq_s32(__rev0_264, __rev1_264, *(int8x16_t *) &__reint1_264); \ + __ret_264 = __builtin_shufflevector(__ret_264, __ret_264, 3, 2, 1, 0); \ + __ret_264; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdot_lane_u32(__p0_173, __p1_173, __p2_173, __p3_173) __extension__ ({ \ - uint32x2_t __s0_173 = __p0_173; \ - uint8x8_t __s1_173 = __p1_173; \ - uint8x8_t __s2_173 = __p2_173; \ - uint32x2_t __ret_173; \ -uint8x8_t __reint_173 = __s2_173; \ -uint32x2_t __reint1_173 = splat_lane_u32(*(uint32x2_t *) &__reint_173, __p3_173); \ - __ret_173 = vdot_u32(__s0_173, __s1_173, *(uint8x8_t *) &__reint1_173); \ - __ret_173; \ +#define vdot_lane_u32(__p0_265, __p1_265, __p2_265, __p3_265) __extension__ ({ \ + uint32x2_t __s0_265 = __p0_265; \ + uint8x8_t __s1_265 = __p1_265; \ + uint8x8_t __s2_265 = __p2_265; \ + uint32x2_t __ret_265; \ +uint8x8_t __reint_265 = __s2_265; \ +uint32x2_t __reint1_265 = splat_lane_u32(*(uint32x2_t *) &__reint_265, __p3_265); \ + __ret_265 = vdot_u32(__s0_265, __s1_265, *(uint8x8_t *) &__reint1_265); \ + __ret_265; \ }) #else -#define vdot_lane_u32(__p0_174, __p1_174, __p2_174, __p3_174) __extension__ ({ \ - uint32x2_t __s0_174 = __p0_174; \ - uint8x8_t __s1_174 = __p1_174; \ - uint8x8_t __s2_174 = __p2_174; \ - uint32x2_t __rev0_174; __rev0_174 = __builtin_shufflevector(__s0_174, __s0_174, 1, 0); \ - uint8x8_t __rev1_174; __rev1_174 = __builtin_shufflevector(__s1_174, __s1_174, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x8_t __rev2_174; __rev2_174 = __builtin_shufflevector(__s2_174, __s2_174, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint32x2_t __ret_174; \ -uint8x8_t __reint_174 = __rev2_174; \ -uint32x2_t __reint1_174 = __noswap_splat_lane_u32(*(uint32x2_t *) &__reint_174, __p3_174); \ - __ret_174 = __noswap_vdot_u32(__rev0_174, __rev1_174, *(uint8x8_t *) &__reint1_174); \ - __ret_174 = __builtin_shufflevector(__ret_174, __ret_174, 1, 0); \ - __ret_174; \ +#define vdot_lane_u32(__p0_266, __p1_266, __p2_266, __p3_266) __extension__ ({ \ + uint32x2_t __s0_266 = __p0_266; \ + uint8x8_t __s1_266 = __p1_266; \ + uint8x8_t __s2_266 = __p2_266; \ + uint32x2_t __rev0_266; __rev0_266 = __builtin_shufflevector(__s0_266, __s0_266, 1, 0); \ + uint8x8_t __rev1_266; __rev1_266 = __builtin_shufflevector(__s1_266, __s1_266, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev2_266; __rev2_266 = __builtin_shufflevector(__s2_266, __s2_266, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint32x2_t __ret_266; \ +uint8x8_t __reint_266 = __rev2_266; \ +uint32x2_t __reint1_266 = __noswap_splat_lane_u32(*(uint32x2_t *) &__reint_266, __p3_266); \ + __ret_266 = __noswap_vdot_u32(__rev0_266, __rev1_266, *(uint8x8_t *) &__reint1_266); \ + __ret_266 = __builtin_shufflevector(__ret_266, __ret_266, 1, 0); \ + __ret_266; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdot_lane_s32(__p0_175, __p1_175, __p2_175, __p3_175) __extension__ ({ \ - int32x2_t __s0_175 = __p0_175; \ - int8x8_t __s1_175 = __p1_175; \ - int8x8_t __s2_175 = __p2_175; \ - int32x2_t __ret_175; \ -int8x8_t __reint_175 = __s2_175; \ -int32x2_t __reint1_175 = splat_lane_s32(*(int32x2_t *) &__reint_175, __p3_175); 
\ - __ret_175 = vdot_s32(__s0_175, __s1_175, *(int8x8_t *) &__reint1_175); \ - __ret_175; \ +#define vdot_lane_s32(__p0_267, __p1_267, __p2_267, __p3_267) __extension__ ({ \ + int32x2_t __s0_267 = __p0_267; \ + int8x8_t __s1_267 = __p1_267; \ + int8x8_t __s2_267 = __p2_267; \ + int32x2_t __ret_267; \ +int8x8_t __reint_267 = __s2_267; \ +int32x2_t __reint1_267 = splat_lane_s32(*(int32x2_t *) &__reint_267, __p3_267); \ + __ret_267 = vdot_s32(__s0_267, __s1_267, *(int8x8_t *) &__reint1_267); \ + __ret_267; \ }) #else -#define vdot_lane_s32(__p0_176, __p1_176, __p2_176, __p3_176) __extension__ ({ \ - int32x2_t __s0_176 = __p0_176; \ - int8x8_t __s1_176 = __p1_176; \ - int8x8_t __s2_176 = __p2_176; \ - int32x2_t __rev0_176; __rev0_176 = __builtin_shufflevector(__s0_176, __s0_176, 1, 0); \ - int8x8_t __rev1_176; __rev1_176 = __builtin_shufflevector(__s1_176, __s1_176, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x8_t __rev2_176; __rev2_176 = __builtin_shufflevector(__s2_176, __s2_176, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32x2_t __ret_176; \ -int8x8_t __reint_176 = __rev2_176; \ -int32x2_t __reint1_176 = __noswap_splat_lane_s32(*(int32x2_t *) &__reint_176, __p3_176); \ - __ret_176 = __noswap_vdot_s32(__rev0_176, __rev1_176, *(int8x8_t *) &__reint1_176); \ - __ret_176 = __builtin_shufflevector(__ret_176, __ret_176, 1, 0); \ - __ret_176; \ +#define vdot_lane_s32(__p0_268, __p1_268, __p2_268, __p3_268) __extension__ ({ \ + int32x2_t __s0_268 = __p0_268; \ + int8x8_t __s1_268 = __p1_268; \ + int8x8_t __s2_268 = __p2_268; \ + int32x2_t __rev0_268; __rev0_268 = __builtin_shufflevector(__s0_268, __s0_268, 1, 0); \ + int8x8_t __rev1_268; __rev1_268 = __builtin_shufflevector(__s1_268, __s1_268, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x8_t __rev2_268; __rev2_268 = __builtin_shufflevector(__s2_268, __s2_268, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x2_t __ret_268; \ +int8x8_t __reint_268 = __rev2_268; \ +int32x2_t __reint1_268 = __noswap_splat_lane_s32(*(int32x2_t *) &__reint_268, __p3_268); \ + __ret_268 = __noswap_vdot_s32(__rev0_268, __rev1_268, *(int8x8_t *) &__reint1_268); \ + __ret_268 = __builtin_shufflevector(__ret_268, __ret_268, 1, 0); \ + __ret_268; \ }) #endif #endif #if defined(__ARM_FEATURE_DOTPROD) && defined(__aarch64__) #ifdef __LITTLE_ENDIAN__ -#define vdotq_laneq_u32(__p0_177, __p1_177, __p2_177, __p3_177) __extension__ ({ \ - uint32x4_t __s0_177 = __p0_177; \ - uint8x16_t __s1_177 = __p1_177; \ - uint8x16_t __s2_177 = __p2_177; \ - uint32x4_t __ret_177; \ -uint8x16_t __reint_177 = __s2_177; \ -uint32x4_t __reint1_177 = splatq_laneq_u32(*(uint32x4_t *) &__reint_177, __p3_177); \ - __ret_177 = vdotq_u32(__s0_177, __s1_177, *(uint8x16_t *) &__reint1_177); \ - __ret_177; \ +#define vdotq_laneq_u32(__p0_269, __p1_269, __p2_269, __p3_269) __extension__ ({ \ + uint32x4_t __s0_269 = __p0_269; \ + uint8x16_t __s1_269 = __p1_269; \ + uint8x16_t __s2_269 = __p2_269; \ + uint32x4_t __ret_269; \ +uint8x16_t __reint_269 = __s2_269; \ +uint32x4_t __reint1_269 = splatq_laneq_u32(*(uint32x4_t *) &__reint_269, __p3_269); \ + __ret_269 = vdotq_u32(__s0_269, __s1_269, *(uint8x16_t *) &__reint1_269); \ + __ret_269; \ }) #else -#define vdotq_laneq_u32(__p0_178, __p1_178, __p2_178, __p3_178) __extension__ ({ \ - uint32x4_t __s0_178 = __p0_178; \ - uint8x16_t __s1_178 = __p1_178; \ - uint8x16_t __s2_178 = __p2_178; \ - uint32x4_t __rev0_178; __rev0_178 = __builtin_shufflevector(__s0_178, __s0_178, 3, 2, 1, 0); \ - uint8x16_t __rev1_178; __rev1_178 = __builtin_shufflevector(__s1_178, __s1_178, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 
1, 0); \ - uint8x16_t __rev2_178; __rev2_178 = __builtin_shufflevector(__s2_178, __s2_178, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint32x4_t __ret_178; \ -uint8x16_t __reint_178 = __rev2_178; \ -uint32x4_t __reint1_178 = __noswap_splatq_laneq_u32(*(uint32x4_t *) &__reint_178, __p3_178); \ - __ret_178 = __noswap_vdotq_u32(__rev0_178, __rev1_178, *(uint8x16_t *) &__reint1_178); \ - __ret_178 = __builtin_shufflevector(__ret_178, __ret_178, 3, 2, 1, 0); \ - __ret_178; \ +#define vdotq_laneq_u32(__p0_270, __p1_270, __p2_270, __p3_270) __extension__ ({ \ + uint32x4_t __s0_270 = __p0_270; \ + uint8x16_t __s1_270 = __p1_270; \ + uint8x16_t __s2_270 = __p2_270; \ + uint32x4_t __rev0_270; __rev0_270 = __builtin_shufflevector(__s0_270, __s0_270, 3, 2, 1, 0); \ + uint8x16_t __rev1_270; __rev1_270 = __builtin_shufflevector(__s1_270, __s1_270, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev2_270; __rev2_270 = __builtin_shufflevector(__s2_270, __s2_270, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint32x4_t __ret_270; \ +uint8x16_t __reint_270 = __rev2_270; \ +uint32x4_t __reint1_270 = __noswap_splatq_laneq_u32(*(uint32x4_t *) &__reint_270, __p3_270); \ + __ret_270 = __noswap_vdotq_u32(__rev0_270, __rev1_270, *(uint8x16_t *) &__reint1_270); \ + __ret_270 = __builtin_shufflevector(__ret_270, __ret_270, 3, 2, 1, 0); \ + __ret_270; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdotq_laneq_s32(__p0_179, __p1_179, __p2_179, __p3_179) __extension__ ({ \ - int32x4_t __s0_179 = __p0_179; \ - int8x16_t __s1_179 = __p1_179; \ - int8x16_t __s2_179 = __p2_179; \ - int32x4_t __ret_179; \ -int8x16_t __reint_179 = __s2_179; \ -int32x4_t __reint1_179 = splatq_laneq_s32(*(int32x4_t *) &__reint_179, __p3_179); \ - __ret_179 = vdotq_s32(__s0_179, __s1_179, *(int8x16_t *) &__reint1_179); \ - __ret_179; \ +#define vdotq_laneq_s32(__p0_271, __p1_271, __p2_271, __p3_271) __extension__ ({ \ + int32x4_t __s0_271 = __p0_271; \ + int8x16_t __s1_271 = __p1_271; \ + int8x16_t __s2_271 = __p2_271; \ + int32x4_t __ret_271; \ +int8x16_t __reint_271 = __s2_271; \ +int32x4_t __reint1_271 = splatq_laneq_s32(*(int32x4_t *) &__reint_271, __p3_271); \ + __ret_271 = vdotq_s32(__s0_271, __s1_271, *(int8x16_t *) &__reint1_271); \ + __ret_271; \ }) #else -#define vdotq_laneq_s32(__p0_180, __p1_180, __p2_180, __p3_180) __extension__ ({ \ - int32x4_t __s0_180 = __p0_180; \ - int8x16_t __s1_180 = __p1_180; \ - int8x16_t __s2_180 = __p2_180; \ - int32x4_t __rev0_180; __rev0_180 = __builtin_shufflevector(__s0_180, __s0_180, 3, 2, 1, 0); \ - int8x16_t __rev1_180; __rev1_180 = __builtin_shufflevector(__s1_180, __s1_180, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __rev2_180; __rev2_180 = __builtin_shufflevector(__s2_180, __s2_180, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32x4_t __ret_180; \ -int8x16_t __reint_180 = __rev2_180; \ -int32x4_t __reint1_180 = __noswap_splatq_laneq_s32(*(int32x4_t *) &__reint_180, __p3_180); \ - __ret_180 = __noswap_vdotq_s32(__rev0_180, __rev1_180, *(int8x16_t *) &__reint1_180); \ - __ret_180 = __builtin_shufflevector(__ret_180, __ret_180, 3, 2, 1, 0); \ - __ret_180; \ +#define vdotq_laneq_s32(__p0_272, __p1_272, __p2_272, __p3_272) __extension__ ({ \ + int32x4_t __s0_272 = __p0_272; \ + int8x16_t __s1_272 = __p1_272; \ + int8x16_t __s2_272 = __p2_272; \ + int32x4_t __rev0_272; __rev0_272 = __builtin_shufflevector(__s0_272, __s0_272, 3, 2, 1, 0); \ + int8x16_t __rev1_272; __rev1_272 = 
__builtin_shufflevector(__s1_272, __s1_272, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __rev2_272; __rev2_272 = __builtin_shufflevector(__s2_272, __s2_272, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x4_t __ret_272; \ +int8x16_t __reint_272 = __rev2_272; \ +int32x4_t __reint1_272 = __noswap_splatq_laneq_s32(*(int32x4_t *) &__reint_272, __p3_272); \ + __ret_272 = __noswap_vdotq_s32(__rev0_272, __rev1_272, *(int8x16_t *) &__reint1_272); \ + __ret_272 = __builtin_shufflevector(__ret_272, __ret_272, 3, 2, 1, 0); \ + __ret_272; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdot_laneq_u32(__p0_181, __p1_181, __p2_181, __p3_181) __extension__ ({ \ - uint32x2_t __s0_181 = __p0_181; \ - uint8x8_t __s1_181 = __p1_181; \ - uint8x16_t __s2_181 = __p2_181; \ - uint32x2_t __ret_181; \ -uint8x16_t __reint_181 = __s2_181; \ -uint32x2_t __reint1_181 = splat_laneq_u32(*(uint32x4_t *) &__reint_181, __p3_181); \ - __ret_181 = vdot_u32(__s0_181, __s1_181, *(uint8x8_t *) &__reint1_181); \ - __ret_181; \ +#define vdot_laneq_u32(__p0_273, __p1_273, __p2_273, __p3_273) __extension__ ({ \ + uint32x2_t __s0_273 = __p0_273; \ + uint8x8_t __s1_273 = __p1_273; \ + uint8x16_t __s2_273 = __p2_273; \ + uint32x2_t __ret_273; \ +uint8x16_t __reint_273 = __s2_273; \ +uint32x2_t __reint1_273 = splat_laneq_u32(*(uint32x4_t *) &__reint_273, __p3_273); \ + __ret_273 = vdot_u32(__s0_273, __s1_273, *(uint8x8_t *) &__reint1_273); \ + __ret_273; \ }) #else -#define vdot_laneq_u32(__p0_182, __p1_182, __p2_182, __p3_182) __extension__ ({ \ - uint32x2_t __s0_182 = __p0_182; \ - uint8x8_t __s1_182 = __p1_182; \ - uint8x16_t __s2_182 = __p2_182; \ - uint32x2_t __rev0_182; __rev0_182 = __builtin_shufflevector(__s0_182, __s0_182, 1, 0); \ - uint8x8_t __rev1_182; __rev1_182 = __builtin_shufflevector(__s1_182, __s1_182, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __rev2_182; __rev2_182 = __builtin_shufflevector(__s2_182, __s2_182, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint32x2_t __ret_182; \ -uint8x16_t __reint_182 = __rev2_182; \ -uint32x2_t __reint1_182 = __noswap_splat_laneq_u32(*(uint32x4_t *) &__reint_182, __p3_182); \ - __ret_182 = __noswap_vdot_u32(__rev0_182, __rev1_182, *(uint8x8_t *) &__reint1_182); \ - __ret_182 = __builtin_shufflevector(__ret_182, __ret_182, 1, 0); \ - __ret_182; \ +#define vdot_laneq_u32(__p0_274, __p1_274, __p2_274, __p3_274) __extension__ ({ \ + uint32x2_t __s0_274 = __p0_274; \ + uint8x8_t __s1_274 = __p1_274; \ + uint8x16_t __s2_274 = __p2_274; \ + uint32x2_t __rev0_274; __rev0_274 = __builtin_shufflevector(__s0_274, __s0_274, 1, 0); \ + uint8x8_t __rev1_274; __rev1_274 = __builtin_shufflevector(__s1_274, __s1_274, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev2_274; __rev2_274 = __builtin_shufflevector(__s2_274, __s2_274, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint32x2_t __ret_274; \ +uint8x16_t __reint_274 = __rev2_274; \ +uint32x2_t __reint1_274 = __noswap_splat_laneq_u32(*(uint32x4_t *) &__reint_274, __p3_274); \ + __ret_274 = __noswap_vdot_u32(__rev0_274, __rev1_274, *(uint8x8_t *) &__reint1_274); \ + __ret_274 = __builtin_shufflevector(__ret_274, __ret_274, 1, 0); \ + __ret_274; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdot_laneq_s32(__p0_183, __p1_183, __p2_183, __p3_183) __extension__ ({ \ - int32x2_t __s0_183 = __p0_183; \ - int8x8_t __s1_183 = __p1_183; \ - int8x16_t __s2_183 = __p2_183; \ - int32x2_t __ret_183; \ -int8x16_t __reint_183 = __s2_183; \ -int32x2_t __reint1_183 = splat_laneq_s32(*(int32x4_t *) 
&__reint_183, __p3_183); \ - __ret_183 = vdot_s32(__s0_183, __s1_183, *(int8x8_t *) &__reint1_183); \ - __ret_183; \ +#define vdot_laneq_s32(__p0_275, __p1_275, __p2_275, __p3_275) __extension__ ({ \ + int32x2_t __s0_275 = __p0_275; \ + int8x8_t __s1_275 = __p1_275; \ + int8x16_t __s2_275 = __p2_275; \ + int32x2_t __ret_275; \ +int8x16_t __reint_275 = __s2_275; \ +int32x2_t __reint1_275 = splat_laneq_s32(*(int32x4_t *) &__reint_275, __p3_275); \ + __ret_275 = vdot_s32(__s0_275, __s1_275, *(int8x8_t *) &__reint1_275); \ + __ret_275; \ }) #else -#define vdot_laneq_s32(__p0_184, __p1_184, __p2_184, __p3_184) __extension__ ({ \ - int32x2_t __s0_184 = __p0_184; \ - int8x8_t __s1_184 = __p1_184; \ - int8x16_t __s2_184 = __p2_184; \ - int32x2_t __rev0_184; __rev0_184 = __builtin_shufflevector(__s0_184, __s0_184, 1, 0); \ - int8x8_t __rev1_184; __rev1_184 = __builtin_shufflevector(__s1_184, __s1_184, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __rev2_184; __rev2_184 = __builtin_shufflevector(__s2_184, __s2_184, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32x2_t __ret_184; \ -int8x16_t __reint_184 = __rev2_184; \ -int32x2_t __reint1_184 = __noswap_splat_laneq_s32(*(int32x4_t *) &__reint_184, __p3_184); \ - __ret_184 = __noswap_vdot_s32(__rev0_184, __rev1_184, *(int8x8_t *) &__reint1_184); \ - __ret_184 = __builtin_shufflevector(__ret_184, __ret_184, 1, 0); \ - __ret_184; \ +#define vdot_laneq_s32(__p0_276, __p1_276, __p2_276, __p3_276) __extension__ ({ \ + int32x2_t __s0_276 = __p0_276; \ + int8x8_t __s1_276 = __p1_276; \ + int8x16_t __s2_276 = __p2_276; \ + int32x2_t __rev0_276; __rev0_276 = __builtin_shufflevector(__s0_276, __s0_276, 1, 0); \ + int8x8_t __rev1_276; __rev1_276 = __builtin_shufflevector(__s1_276, __s1_276, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __rev2_276; __rev2_276 = __builtin_shufflevector(__s2_276, __s2_276, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x2_t __ret_276; \ +int8x16_t __reint_276 = __rev2_276; \ +int32x2_t __reint1_276 = __noswap_splat_laneq_s32(*(int32x4_t *) &__reint_276, __p3_276); \ + __ret_276 = __noswap_vdot_s32(__rev0_276, __rev1_276, *(int8x8_t *) &__reint1_276); \ + __ret_276 = __builtin_shufflevector(__ret_276, __ret_276, 1, 0); \ + __ret_276; \ }) #endif @@ -42518,44 +44254,44 @@ __ai float16x4_t vmul_f16(float16x4_t __p0, float16x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vmulq_lane_f16(__p0_185, __p1_185, __p2_185) __extension__ ({ \ - float16x8_t __s0_185 = __p0_185; \ - float16x4_t __s1_185 = __p1_185; \ - float16x8_t __ret_185; \ - __ret_185 = __s0_185 * splatq_lane_f16(__s1_185, __p2_185); \ - __ret_185; \ +#define vmulq_lane_f16(__p0_277, __p1_277, __p2_277) __extension__ ({ \ + float16x8_t __s0_277 = __p0_277; \ + float16x4_t __s1_277 = __p1_277; \ + float16x8_t __ret_277; \ + __ret_277 = __s0_277 * splatq_lane_f16(__s1_277, __p2_277); \ + __ret_277; \ }) #else -#define vmulq_lane_f16(__p0_186, __p1_186, __p2_186) __extension__ ({ \ - float16x8_t __s0_186 = __p0_186; \ - float16x4_t __s1_186 = __p1_186; \ - float16x8_t __rev0_186; __rev0_186 = __builtin_shufflevector(__s0_186, __s0_186, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x4_t __rev1_186; __rev1_186 = __builtin_shufflevector(__s1_186, __s1_186, 3, 2, 1, 0); \ - float16x8_t __ret_186; \ - __ret_186 = __rev0_186 * __noswap_splatq_lane_f16(__rev1_186, __p2_186); \ - __ret_186 = __builtin_shufflevector(__ret_186, __ret_186, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_186; \ +#define vmulq_lane_f16(__p0_278, __p1_278, __p2_278) __extension__ ({ \ + float16x8_t __s0_278 
= __p0_278; \ + float16x4_t __s1_278 = __p1_278; \ + float16x8_t __rev0_278; __rev0_278 = __builtin_shufflevector(__s0_278, __s0_278, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __rev1_278; __rev1_278 = __builtin_shufflevector(__s1_278, __s1_278, 3, 2, 1, 0); \ + float16x8_t __ret_278; \ + __ret_278 = __rev0_278 * __noswap_splatq_lane_f16(__rev1_278, __p2_278); \ + __ret_278 = __builtin_shufflevector(__ret_278, __ret_278, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_278; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmul_lane_f16(__p0_187, __p1_187, __p2_187) __extension__ ({ \ - float16x4_t __s0_187 = __p0_187; \ - float16x4_t __s1_187 = __p1_187; \ - float16x4_t __ret_187; \ - __ret_187 = __s0_187 * splat_lane_f16(__s1_187, __p2_187); \ - __ret_187; \ +#define vmul_lane_f16(__p0_279, __p1_279, __p2_279) __extension__ ({ \ + float16x4_t __s0_279 = __p0_279; \ + float16x4_t __s1_279 = __p1_279; \ + float16x4_t __ret_279; \ + __ret_279 = __s0_279 * splat_lane_f16(__s1_279, __p2_279); \ + __ret_279; \ }) #else -#define vmul_lane_f16(__p0_188, __p1_188, __p2_188) __extension__ ({ \ - float16x4_t __s0_188 = __p0_188; \ - float16x4_t __s1_188 = __p1_188; \ - float16x4_t __rev0_188; __rev0_188 = __builtin_shufflevector(__s0_188, __s0_188, 3, 2, 1, 0); \ - float16x4_t __rev1_188; __rev1_188 = __builtin_shufflevector(__s1_188, __s1_188, 3, 2, 1, 0); \ - float16x4_t __ret_188; \ - __ret_188 = __rev0_188 * __noswap_splat_lane_f16(__rev1_188, __p2_188); \ - __ret_188 = __builtin_shufflevector(__ret_188, __ret_188, 3, 2, 1, 0); \ - __ret_188; \ +#define vmul_lane_f16(__p0_280, __p1_280, __p2_280) __extension__ ({ \ + float16x4_t __s0_280 = __p0_280; \ + float16x4_t __s1_280 = __p1_280; \ + float16x4_t __rev0_280; __rev0_280 = __builtin_shufflevector(__s0_280, __s0_280, 3, 2, 1, 0); \ + float16x4_t __rev1_280; __rev1_280 = __builtin_shufflevector(__s1_280, __s1_280, 3, 2, 1, 0); \ + float16x4_t __ret_280; \ + __ret_280 = __rev0_280 * __noswap_splat_lane_f16(__rev1_280, __p2_280); \ + __ret_280 = __builtin_shufflevector(__ret_280, __ret_280, 3, 2, 1, 0); \ + __ret_280; \ }) #endif @@ -43297,140 +45033,140 @@ __ai float16x4_t vdiv_f16(float16x4_t __p0, float16x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vfmsh_lane_f16(__p0_189, __p1_189, __p2_189, __p3_189) __extension__ ({ \ - float16_t __s0_189 = __p0_189; \ - float16_t __s1_189 = __p1_189; \ - float16x4_t __s2_189 = __p2_189; \ - float16_t __ret_189; \ - __ret_189 = vfmah_lane_f16(__s0_189, -__s1_189, __s2_189, __p3_189); \ - __ret_189; \ +#define vfmsh_lane_f16(__p0_281, __p1_281, __p2_281, __p3_281) __extension__ ({ \ + float16_t __s0_281 = __p0_281; \ + float16_t __s1_281 = __p1_281; \ + float16x4_t __s2_281 = __p2_281; \ + float16_t __ret_281; \ + __ret_281 = vfmah_lane_f16(__s0_281, -__s1_281, __s2_281, __p3_281); \ + __ret_281; \ }) #else -#define vfmsh_lane_f16(__p0_190, __p1_190, __p2_190, __p3_190) __extension__ ({ \ - float16_t __s0_190 = __p0_190; \ - float16_t __s1_190 = __p1_190; \ - float16x4_t __s2_190 = __p2_190; \ - float16x4_t __rev2_190; __rev2_190 = __builtin_shufflevector(__s2_190, __s2_190, 3, 2, 1, 0); \ - float16_t __ret_190; \ - __ret_190 = __noswap_vfmah_lane_f16(__s0_190, -__s1_190, __rev2_190, __p3_190); \ - __ret_190; \ +#define vfmsh_lane_f16(__p0_282, __p1_282, __p2_282, __p3_282) __extension__ ({ \ + float16_t __s0_282 = __p0_282; \ + float16_t __s1_282 = __p1_282; \ + float16x4_t __s2_282 = __p2_282; \ + float16x4_t __rev2_282; __rev2_282 = __builtin_shufflevector(__s2_282, __s2_282, 3, 2, 1, 0); \ + float16_t __ret_282; \ 
+ __ret_282 = __noswap_vfmah_lane_f16(__s0_282, -__s1_282, __rev2_282, __p3_282); \ + __ret_282; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmsq_lane_f16(__p0_191, __p1_191, __p2_191, __p3_191) __extension__ ({ \ - float16x8_t __s0_191 = __p0_191; \ - float16x8_t __s1_191 = __p1_191; \ - float16x4_t __s2_191 = __p2_191; \ - float16x8_t __ret_191; \ - __ret_191 = vfmaq_lane_f16(__s0_191, -__s1_191, __s2_191, __p3_191); \ - __ret_191; \ +#define vfmsq_lane_f16(__p0_283, __p1_283, __p2_283, __p3_283) __extension__ ({ \ + float16x8_t __s0_283 = __p0_283; \ + float16x8_t __s1_283 = __p1_283; \ + float16x4_t __s2_283 = __p2_283; \ + float16x8_t __ret_283; \ + __ret_283 = vfmaq_lane_f16(__s0_283, -__s1_283, __s2_283, __p3_283); \ + __ret_283; \ }) #else -#define vfmsq_lane_f16(__p0_192, __p1_192, __p2_192, __p3_192) __extension__ ({ \ - float16x8_t __s0_192 = __p0_192; \ - float16x8_t __s1_192 = __p1_192; \ - float16x4_t __s2_192 = __p2_192; \ - float16x8_t __rev0_192; __rev0_192 = __builtin_shufflevector(__s0_192, __s0_192, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x8_t __rev1_192; __rev1_192 = __builtin_shufflevector(__s1_192, __s1_192, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x4_t __rev2_192; __rev2_192 = __builtin_shufflevector(__s2_192, __s2_192, 3, 2, 1, 0); \ - float16x8_t __ret_192; \ - __ret_192 = __noswap_vfmaq_lane_f16(__rev0_192, -__rev1_192, __rev2_192, __p3_192); \ - __ret_192 = __builtin_shufflevector(__ret_192, __ret_192, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_192; \ +#define vfmsq_lane_f16(__p0_284, __p1_284, __p2_284, __p3_284) __extension__ ({ \ + float16x8_t __s0_284 = __p0_284; \ + float16x8_t __s1_284 = __p1_284; \ + float16x4_t __s2_284 = __p2_284; \ + float16x8_t __rev0_284; __rev0_284 = __builtin_shufflevector(__s0_284, __s0_284, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1_284; __rev1_284 = __builtin_shufflevector(__s1_284, __s1_284, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __rev2_284; __rev2_284 = __builtin_shufflevector(__s2_284, __s2_284, 3, 2, 1, 0); \ + float16x8_t __ret_284; \ + __ret_284 = __noswap_vfmaq_lane_f16(__rev0_284, -__rev1_284, __rev2_284, __p3_284); \ + __ret_284 = __builtin_shufflevector(__ret_284, __ret_284, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_284; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfms_lane_f16(__p0_193, __p1_193, __p2_193, __p3_193) __extension__ ({ \ - float16x4_t __s0_193 = __p0_193; \ - float16x4_t __s1_193 = __p1_193; \ - float16x4_t __s2_193 = __p2_193; \ - float16x4_t __ret_193; \ - __ret_193 = vfma_lane_f16(__s0_193, -__s1_193, __s2_193, __p3_193); \ - __ret_193; \ +#define vfms_lane_f16(__p0_285, __p1_285, __p2_285, __p3_285) __extension__ ({ \ + float16x4_t __s0_285 = __p0_285; \ + float16x4_t __s1_285 = __p1_285; \ + float16x4_t __s2_285 = __p2_285; \ + float16x4_t __ret_285; \ + __ret_285 = vfma_lane_f16(__s0_285, -__s1_285, __s2_285, __p3_285); \ + __ret_285; \ }) #else -#define vfms_lane_f16(__p0_194, __p1_194, __p2_194, __p3_194) __extension__ ({ \ - float16x4_t __s0_194 = __p0_194; \ - float16x4_t __s1_194 = __p1_194; \ - float16x4_t __s2_194 = __p2_194; \ - float16x4_t __rev0_194; __rev0_194 = __builtin_shufflevector(__s0_194, __s0_194, 3, 2, 1, 0); \ - float16x4_t __rev1_194; __rev1_194 = __builtin_shufflevector(__s1_194, __s1_194, 3, 2, 1, 0); \ - float16x4_t __rev2_194; __rev2_194 = __builtin_shufflevector(__s2_194, __s2_194, 3, 2, 1, 0); \ - float16x4_t __ret_194; \ - __ret_194 = __noswap_vfma_lane_f16(__rev0_194, -__rev1_194, __rev2_194, __p3_194); \ - __ret_194 = __builtin_shufflevector(__ret_194, __ret_194, 3, 2, 1, 0); \ - 
__ret_194; \ +#define vfms_lane_f16(__p0_286, __p1_286, __p2_286, __p3_286) __extension__ ({ \ + float16x4_t __s0_286 = __p0_286; \ + float16x4_t __s1_286 = __p1_286; \ + float16x4_t __s2_286 = __p2_286; \ + float16x4_t __rev0_286; __rev0_286 = __builtin_shufflevector(__s0_286, __s0_286, 3, 2, 1, 0); \ + float16x4_t __rev1_286; __rev1_286 = __builtin_shufflevector(__s1_286, __s1_286, 3, 2, 1, 0); \ + float16x4_t __rev2_286; __rev2_286 = __builtin_shufflevector(__s2_286, __s2_286, 3, 2, 1, 0); \ + float16x4_t __ret_286; \ + __ret_286 = __noswap_vfma_lane_f16(__rev0_286, -__rev1_286, __rev2_286, __p3_286); \ + __ret_286 = __builtin_shufflevector(__ret_286, __ret_286, 3, 2, 1, 0); \ + __ret_286; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmsh_laneq_f16(__p0_195, __p1_195, __p2_195, __p3_195) __extension__ ({ \ - float16_t __s0_195 = __p0_195; \ - float16_t __s1_195 = __p1_195; \ - float16x8_t __s2_195 = __p2_195; \ - float16_t __ret_195; \ - __ret_195 = vfmah_laneq_f16(__s0_195, -__s1_195, __s2_195, __p3_195); \ - __ret_195; \ +#define vfmsh_laneq_f16(__p0_287, __p1_287, __p2_287, __p3_287) __extension__ ({ \ + float16_t __s0_287 = __p0_287; \ + float16_t __s1_287 = __p1_287; \ + float16x8_t __s2_287 = __p2_287; \ + float16_t __ret_287; \ + __ret_287 = vfmah_laneq_f16(__s0_287, -__s1_287, __s2_287, __p3_287); \ + __ret_287; \ }) #else -#define vfmsh_laneq_f16(__p0_196, __p1_196, __p2_196, __p3_196) __extension__ ({ \ - float16_t __s0_196 = __p0_196; \ - float16_t __s1_196 = __p1_196; \ - float16x8_t __s2_196 = __p2_196; \ - float16x8_t __rev2_196; __rev2_196 = __builtin_shufflevector(__s2_196, __s2_196, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16_t __ret_196; \ - __ret_196 = __noswap_vfmah_laneq_f16(__s0_196, -__s1_196, __rev2_196, __p3_196); \ - __ret_196; \ +#define vfmsh_laneq_f16(__p0_288, __p1_288, __p2_288, __p3_288) __extension__ ({ \ + float16_t __s0_288 = __p0_288; \ + float16_t __s1_288 = __p1_288; \ + float16x8_t __s2_288 = __p2_288; \ + float16x8_t __rev2_288; __rev2_288 = __builtin_shufflevector(__s2_288, __s2_288, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16_t __ret_288; \ + __ret_288 = __noswap_vfmah_laneq_f16(__s0_288, -__s1_288, __rev2_288, __p3_288); \ + __ret_288; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmsq_laneq_f16(__p0_197, __p1_197, __p2_197, __p3_197) __extension__ ({ \ - float16x8_t __s0_197 = __p0_197; \ - float16x8_t __s1_197 = __p1_197; \ - float16x8_t __s2_197 = __p2_197; \ - float16x8_t __ret_197; \ - __ret_197 = vfmaq_laneq_f16(__s0_197, -__s1_197, __s2_197, __p3_197); \ - __ret_197; \ +#define vfmsq_laneq_f16(__p0_289, __p1_289, __p2_289, __p3_289) __extension__ ({ \ + float16x8_t __s0_289 = __p0_289; \ + float16x8_t __s1_289 = __p1_289; \ + float16x8_t __s2_289 = __p2_289; \ + float16x8_t __ret_289; \ + __ret_289 = vfmaq_laneq_f16(__s0_289, -__s1_289, __s2_289, __p3_289); \ + __ret_289; \ }) #else -#define vfmsq_laneq_f16(__p0_198, __p1_198, __p2_198, __p3_198) __extension__ ({ \ - float16x8_t __s0_198 = __p0_198; \ - float16x8_t __s1_198 = __p1_198; \ - float16x8_t __s2_198 = __p2_198; \ - float16x8_t __rev0_198; __rev0_198 = __builtin_shufflevector(__s0_198, __s0_198, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x8_t __rev1_198; __rev1_198 = __builtin_shufflevector(__s1_198, __s1_198, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x8_t __rev2_198; __rev2_198 = __builtin_shufflevector(__s2_198, __s2_198, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x8_t __ret_198; \ - __ret_198 = __noswap_vfmaq_laneq_f16(__rev0_198, -__rev1_198, __rev2_198, __p3_198); \ - __ret_198 = 
__builtin_shufflevector(__ret_198, __ret_198, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_198; \ +#define vfmsq_laneq_f16(__p0_290, __p1_290, __p2_290, __p3_290) __extension__ ({ \ + float16x8_t __s0_290 = __p0_290; \ + float16x8_t __s1_290 = __p1_290; \ + float16x8_t __s2_290 = __p2_290; \ + float16x8_t __rev0_290; __rev0_290 = __builtin_shufflevector(__s0_290, __s0_290, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1_290; __rev1_290 = __builtin_shufflevector(__s1_290, __s1_290, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev2_290; __rev2_290 = __builtin_shufflevector(__s2_290, __s2_290, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret_290; \ + __ret_290 = __noswap_vfmaq_laneq_f16(__rev0_290, -__rev1_290, __rev2_290, __p3_290); \ + __ret_290 = __builtin_shufflevector(__ret_290, __ret_290, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_290; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfms_laneq_f16(__p0_199, __p1_199, __p2_199, __p3_199) __extension__ ({ \ - float16x4_t __s0_199 = __p0_199; \ - float16x4_t __s1_199 = __p1_199; \ - float16x8_t __s2_199 = __p2_199; \ - float16x4_t __ret_199; \ - __ret_199 = vfma_laneq_f16(__s0_199, -__s1_199, __s2_199, __p3_199); \ - __ret_199; \ +#define vfms_laneq_f16(__p0_291, __p1_291, __p2_291, __p3_291) __extension__ ({ \ + float16x4_t __s0_291 = __p0_291; \ + float16x4_t __s1_291 = __p1_291; \ + float16x8_t __s2_291 = __p2_291; \ + float16x4_t __ret_291; \ + __ret_291 = vfma_laneq_f16(__s0_291, -__s1_291, __s2_291, __p3_291); \ + __ret_291; \ }) #else -#define vfms_laneq_f16(__p0_200, __p1_200, __p2_200, __p3_200) __extension__ ({ \ - float16x4_t __s0_200 = __p0_200; \ - float16x4_t __s1_200 = __p1_200; \ - float16x8_t __s2_200 = __p2_200; \ - float16x4_t __rev0_200; __rev0_200 = __builtin_shufflevector(__s0_200, __s0_200, 3, 2, 1, 0); \ - float16x4_t __rev1_200; __rev1_200 = __builtin_shufflevector(__s1_200, __s1_200, 3, 2, 1, 0); \ - float16x8_t __rev2_200; __rev2_200 = __builtin_shufflevector(__s2_200, __s2_200, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x4_t __ret_200; \ - __ret_200 = __noswap_vfma_laneq_f16(__rev0_200, -__rev1_200, __rev2_200, __p3_200); \ - __ret_200 = __builtin_shufflevector(__ret_200, __ret_200, 3, 2, 1, 0); \ - __ret_200; \ +#define vfms_laneq_f16(__p0_292, __p1_292, __p2_292, __p3_292) __extension__ ({ \ + float16x4_t __s0_292 = __p0_292; \ + float16x4_t __s1_292 = __p1_292; \ + float16x8_t __s2_292 = __p2_292; \ + float16x4_t __rev0_292; __rev0_292 = __builtin_shufflevector(__s0_292, __s0_292, 3, 2, 1, 0); \ + float16x4_t __rev1_292; __rev1_292 = __builtin_shufflevector(__s1_292, __s1_292, 3, 2, 1, 0); \ + float16x8_t __rev2_292; __rev2_292 = __builtin_shufflevector(__s2_292, __s2_292, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __ret_292; \ + __ret_292 = __noswap_vfma_laneq_f16(__rev0_292, -__rev1_292, __rev2_292, __p3_292); \ + __ret_292 = __builtin_shufflevector(__ret_292, __ret_292, 3, 2, 1, 0); \ + __ret_292; \ }) #endif @@ -43617,44 +45353,44 @@ __ai float16x4_t vdiv_f16(float16x4_t __p0, float16x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vmulq_laneq_f16(__p0_201, __p1_201, __p2_201) __extension__ ({ \ - float16x8_t __s0_201 = __p0_201; \ - float16x8_t __s1_201 = __p1_201; \ - float16x8_t __ret_201; \ - __ret_201 = __s0_201 * splatq_laneq_f16(__s1_201, __p2_201); \ - __ret_201; \ +#define vmulq_laneq_f16(__p0_293, __p1_293, __p2_293) __extension__ ({ \ + float16x8_t __s0_293 = __p0_293; \ + float16x8_t __s1_293 = __p1_293; \ + float16x8_t __ret_293; \ + __ret_293 = __s0_293 * splatq_laneq_f16(__s1_293, __p2_293); \ + __ret_293; \ }) #else 
-#define vmulq_laneq_f16(__p0_202, __p1_202, __p2_202) __extension__ ({ \ - float16x8_t __s0_202 = __p0_202; \ - float16x8_t __s1_202 = __p1_202; \ - float16x8_t __rev0_202; __rev0_202 = __builtin_shufflevector(__s0_202, __s0_202, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x8_t __rev1_202; __rev1_202 = __builtin_shufflevector(__s1_202, __s1_202, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x8_t __ret_202; \ - __ret_202 = __rev0_202 * __noswap_splatq_laneq_f16(__rev1_202, __p2_202); \ - __ret_202 = __builtin_shufflevector(__ret_202, __ret_202, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_202; \ +#define vmulq_laneq_f16(__p0_294, __p1_294, __p2_294) __extension__ ({ \ + float16x8_t __s0_294 = __p0_294; \ + float16x8_t __s1_294 = __p1_294; \ + float16x8_t __rev0_294; __rev0_294 = __builtin_shufflevector(__s0_294, __s0_294, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1_294; __rev1_294 = __builtin_shufflevector(__s1_294, __s1_294, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret_294; \ + __ret_294 = __rev0_294 * __noswap_splatq_laneq_f16(__rev1_294, __p2_294); \ + __ret_294 = __builtin_shufflevector(__ret_294, __ret_294, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_294; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmul_laneq_f16(__p0_203, __p1_203, __p2_203) __extension__ ({ \ - float16x4_t __s0_203 = __p0_203; \ - float16x8_t __s1_203 = __p1_203; \ - float16x4_t __ret_203; \ - __ret_203 = __s0_203 * splat_laneq_f16(__s1_203, __p2_203); \ - __ret_203; \ +#define vmul_laneq_f16(__p0_295, __p1_295, __p2_295) __extension__ ({ \ + float16x4_t __s0_295 = __p0_295; \ + float16x8_t __s1_295 = __p1_295; \ + float16x4_t __ret_295; \ + __ret_295 = __s0_295 * splat_laneq_f16(__s1_295, __p2_295); \ + __ret_295; \ }) #else -#define vmul_laneq_f16(__p0_204, __p1_204, __p2_204) __extension__ ({ \ - float16x4_t __s0_204 = __p0_204; \ - float16x8_t __s1_204 = __p1_204; \ - float16x4_t __rev0_204; __rev0_204 = __builtin_shufflevector(__s0_204, __s0_204, 3, 2, 1, 0); \ - float16x8_t __rev1_204; __rev1_204 = __builtin_shufflevector(__s1_204, __s1_204, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x4_t __ret_204; \ - __ret_204 = __rev0_204 * __noswap_splat_laneq_f16(__rev1_204, __p2_204); \ - __ret_204 = __builtin_shufflevector(__ret_204, __ret_204, 3, 2, 1, 0); \ - __ret_204; \ +#define vmul_laneq_f16(__p0_296, __p1_296, __p2_296) __extension__ ({ \ + float16x4_t __s0_296 = __p0_296; \ + float16x8_t __s1_296 = __p1_296; \ + float16x4_t __rev0_296; __rev0_296 = __builtin_shufflevector(__s0_296, __s0_296, 3, 2, 1, 0); \ + float16x8_t __rev1_296; __rev1_296 = __builtin_shufflevector(__s1_296, __s1_296, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __ret_296; \ + __ret_296 = __rev0_296 * __noswap_splat_laneq_f16(__rev1_296, __p2_296); \ + __ret_296 = __builtin_shufflevector(__ret_296, __ret_296, 3, 2, 1, 0); \ + __ret_296; \ }) #endif @@ -43722,44 +45458,44 @@ __ai float16x4_t __noswap_vmulx_f16(float16x4_t __p0, float16x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vmulxq_lane_f16(__p0_205, __p1_205, __p2_205) __extension__ ({ \ - float16x8_t __s0_205 = __p0_205; \ - float16x4_t __s1_205 = __p1_205; \ - float16x8_t __ret_205; \ - __ret_205 = vmulxq_f16(__s0_205, splatq_lane_f16(__s1_205, __p2_205)); \ - __ret_205; \ +#define vmulxq_lane_f16(__p0_297, __p1_297, __p2_297) __extension__ ({ \ + float16x8_t __s0_297 = __p0_297; \ + float16x4_t __s1_297 = __p1_297; \ + float16x8_t __ret_297; \ + __ret_297 = vmulxq_f16(__s0_297, splatq_lane_f16(__s1_297, __p2_297)); \ + __ret_297; \ }) #else -#define vmulxq_lane_f16(__p0_206, __p1_206, __p2_206) __extension__ ({ \ - 
float16x8_t __s0_206 = __p0_206; \ - float16x4_t __s1_206 = __p1_206; \ - float16x8_t __rev0_206; __rev0_206 = __builtin_shufflevector(__s0_206, __s0_206, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x4_t __rev1_206; __rev1_206 = __builtin_shufflevector(__s1_206, __s1_206, 3, 2, 1, 0); \ - float16x8_t __ret_206; \ - __ret_206 = __noswap_vmulxq_f16(__rev0_206, __noswap_splatq_lane_f16(__rev1_206, __p2_206)); \ - __ret_206 = __builtin_shufflevector(__ret_206, __ret_206, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_206; \ +#define vmulxq_lane_f16(__p0_298, __p1_298, __p2_298) __extension__ ({ \ + float16x8_t __s0_298 = __p0_298; \ + float16x4_t __s1_298 = __p1_298; \ + float16x8_t __rev0_298; __rev0_298 = __builtin_shufflevector(__s0_298, __s0_298, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __rev1_298; __rev1_298 = __builtin_shufflevector(__s1_298, __s1_298, 3, 2, 1, 0); \ + float16x8_t __ret_298; \ + __ret_298 = __noswap_vmulxq_f16(__rev0_298, __noswap_splatq_lane_f16(__rev1_298, __p2_298)); \ + __ret_298 = __builtin_shufflevector(__ret_298, __ret_298, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_298; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmulx_lane_f16(__p0_207, __p1_207, __p2_207) __extension__ ({ \ - float16x4_t __s0_207 = __p0_207; \ - float16x4_t __s1_207 = __p1_207; \ - float16x4_t __ret_207; \ - __ret_207 = vmulx_f16(__s0_207, splat_lane_f16(__s1_207, __p2_207)); \ - __ret_207; \ +#define vmulx_lane_f16(__p0_299, __p1_299, __p2_299) __extension__ ({ \ + float16x4_t __s0_299 = __p0_299; \ + float16x4_t __s1_299 = __p1_299; \ + float16x4_t __ret_299; \ + __ret_299 = vmulx_f16(__s0_299, splat_lane_f16(__s1_299, __p2_299)); \ + __ret_299; \ }) #else -#define vmulx_lane_f16(__p0_208, __p1_208, __p2_208) __extension__ ({ \ - float16x4_t __s0_208 = __p0_208; \ - float16x4_t __s1_208 = __p1_208; \ - float16x4_t __rev0_208; __rev0_208 = __builtin_shufflevector(__s0_208, __s0_208, 3, 2, 1, 0); \ - float16x4_t __rev1_208; __rev1_208 = __builtin_shufflevector(__s1_208, __s1_208, 3, 2, 1, 0); \ - float16x4_t __ret_208; \ - __ret_208 = __noswap_vmulx_f16(__rev0_208, __noswap_splat_lane_f16(__rev1_208, __p2_208)); \ - __ret_208 = __builtin_shufflevector(__ret_208, __ret_208, 3, 2, 1, 0); \ - __ret_208; \ +#define vmulx_lane_f16(__p0_300, __p1_300, __p2_300) __extension__ ({ \ + float16x4_t __s0_300 = __p0_300; \ + float16x4_t __s1_300 = __p1_300; \ + float16x4_t __rev0_300; __rev0_300 = __builtin_shufflevector(__s0_300, __s0_300, 3, 2, 1, 0); \ + float16x4_t __rev1_300; __rev1_300 = __builtin_shufflevector(__s1_300, __s1_300, 3, 2, 1, 0); \ + float16x4_t __ret_300; \ + __ret_300 = __noswap_vmulx_f16(__rev0_300, __noswap_splat_lane_f16(__rev1_300, __p2_300)); \ + __ret_300 = __builtin_shufflevector(__ret_300, __ret_300, 3, 2, 1, 0); \ + __ret_300; \ }) #endif @@ -43783,44 +45519,44 @@ __ai float16x4_t __noswap_vmulx_f16(float16x4_t __p0, float16x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vmulxq_laneq_f16(__p0_209, __p1_209, __p2_209) __extension__ ({ \ - float16x8_t __s0_209 = __p0_209; \ - float16x8_t __s1_209 = __p1_209; \ - float16x8_t __ret_209; \ - __ret_209 = vmulxq_f16(__s0_209, splatq_laneq_f16(__s1_209, __p2_209)); \ - __ret_209; \ +#define vmulxq_laneq_f16(__p0_301, __p1_301, __p2_301) __extension__ ({ \ + float16x8_t __s0_301 = __p0_301; \ + float16x8_t __s1_301 = __p1_301; \ + float16x8_t __ret_301; \ + __ret_301 = vmulxq_f16(__s0_301, splatq_laneq_f16(__s1_301, __p2_301)); \ + __ret_301; \ }) #else -#define vmulxq_laneq_f16(__p0_210, __p1_210, __p2_210) __extension__ ({ \ - float16x8_t __s0_210 = 
__p0_210; \ - float16x8_t __s1_210 = __p1_210; \ - float16x8_t __rev0_210; __rev0_210 = __builtin_shufflevector(__s0_210, __s0_210, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x8_t __rev1_210; __rev1_210 = __builtin_shufflevector(__s1_210, __s1_210, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x8_t __ret_210; \ - __ret_210 = __noswap_vmulxq_f16(__rev0_210, __noswap_splatq_laneq_f16(__rev1_210, __p2_210)); \ - __ret_210 = __builtin_shufflevector(__ret_210, __ret_210, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_210; \ +#define vmulxq_laneq_f16(__p0_302, __p1_302, __p2_302) __extension__ ({ \ + float16x8_t __s0_302 = __p0_302; \ + float16x8_t __s1_302 = __p1_302; \ + float16x8_t __rev0_302; __rev0_302 = __builtin_shufflevector(__s0_302, __s0_302, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1_302; __rev1_302 = __builtin_shufflevector(__s1_302, __s1_302, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret_302; \ + __ret_302 = __noswap_vmulxq_f16(__rev0_302, __noswap_splatq_laneq_f16(__rev1_302, __p2_302)); \ + __ret_302 = __builtin_shufflevector(__ret_302, __ret_302, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_302; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmulx_laneq_f16(__p0_211, __p1_211, __p2_211) __extension__ ({ \ - float16x4_t __s0_211 = __p0_211; \ - float16x8_t __s1_211 = __p1_211; \ - float16x4_t __ret_211; \ - __ret_211 = vmulx_f16(__s0_211, splat_laneq_f16(__s1_211, __p2_211)); \ - __ret_211; \ +#define vmulx_laneq_f16(__p0_303, __p1_303, __p2_303) __extension__ ({ \ + float16x4_t __s0_303 = __p0_303; \ + float16x8_t __s1_303 = __p1_303; \ + float16x4_t __ret_303; \ + __ret_303 = vmulx_f16(__s0_303, splat_laneq_f16(__s1_303, __p2_303)); \ + __ret_303; \ }) #else -#define vmulx_laneq_f16(__p0_212, __p1_212, __p2_212) __extension__ ({ \ - float16x4_t __s0_212 = __p0_212; \ - float16x8_t __s1_212 = __p1_212; \ - float16x4_t __rev0_212; __rev0_212 = __builtin_shufflevector(__s0_212, __s0_212, 3, 2, 1, 0); \ - float16x8_t __rev1_212; __rev1_212 = __builtin_shufflevector(__s1_212, __s1_212, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x4_t __ret_212; \ - __ret_212 = __noswap_vmulx_f16(__rev0_212, __noswap_splat_laneq_f16(__rev1_212, __p2_212)); \ - __ret_212 = __builtin_shufflevector(__ret_212, __ret_212, 3, 2, 1, 0); \ - __ret_212; \ +#define vmulx_laneq_f16(__p0_304, __p1_304, __p2_304) __extension__ ({ \ + float16x4_t __s0_304 = __p0_304; \ + float16x8_t __s1_304 = __p1_304; \ + float16x4_t __rev0_304; __rev0_304 = __builtin_shufflevector(__s0_304, __s0_304, 3, 2, 1, 0); \ + float16x8_t __rev1_304; __rev1_304 = __builtin_shufflevector(__s1_304, __s1_304, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __ret_304; \ + __ret_304 = __noswap_vmulx_f16(__rev0_304, __noswap_splat_laneq_f16(__rev1_304, __p2_304)); \ + __ret_304 = __builtin_shufflevector(__ret_304, __ret_304, 3, 2, 1, 0); \ + __ret_304; \ }) #endif @@ -44336,54 +46072,54 @@ __ai int32x2_t __noswap_vusdot_s32(int32x2_t __p0, uint8x8_t __p1, int8x8_t __p2 #endif #ifdef __LITTLE_ENDIAN__ -#define vusdotq_lane_s32(__p0_213, __p1_213, __p2_213, __p3_213) __extension__ ({ \ - int32x4_t __s0_213 = __p0_213; \ - uint8x16_t __s1_213 = __p1_213; \ - int8x8_t __s2_213 = __p2_213; \ - int32x4_t __ret_213; \ -int8x8_t __reint_213 = __s2_213; \ - __ret_213 = vusdotq_s32(__s0_213, __s1_213, (int8x16_t)(splatq_lane_s32(*(int32x2_t *) &__reint_213, __p3_213))); \ - __ret_213; \ +#define vusdotq_lane_s32(__p0_305, __p1_305, __p2_305, __p3_305) __extension__ ({ \ + int32x4_t __s0_305 = __p0_305; \ + uint8x16_t __s1_305 = __p1_305; \ + int8x8_t __s2_305 = __p2_305; \ + int32x4_t __ret_305; \ 
+int8x8_t __reint_305 = __s2_305; \ + __ret_305 = vusdotq_s32(__s0_305, __s1_305, (int8x16_t)(splatq_lane_s32(*(int32x2_t *) &__reint_305, __p3_305))); \ + __ret_305; \ }) #else -#define vusdotq_lane_s32(__p0_214, __p1_214, __p2_214, __p3_214) __extension__ ({ \ - int32x4_t __s0_214 = __p0_214; \ - uint8x16_t __s1_214 = __p1_214; \ - int8x8_t __s2_214 = __p2_214; \ - int32x4_t __rev0_214; __rev0_214 = __builtin_shufflevector(__s0_214, __s0_214, 3, 2, 1, 0); \ - uint8x16_t __rev1_214; __rev1_214 = __builtin_shufflevector(__s1_214, __s1_214, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x8_t __rev2_214; __rev2_214 = __builtin_shufflevector(__s2_214, __s2_214, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32x4_t __ret_214; \ -int8x8_t __reint_214 = __rev2_214; \ - __ret_214 = __noswap_vusdotq_s32(__rev0_214, __rev1_214, (int8x16_t)(__noswap_splatq_lane_s32(*(int32x2_t *) &__reint_214, __p3_214))); \ - __ret_214 = __builtin_shufflevector(__ret_214, __ret_214, 3, 2, 1, 0); \ - __ret_214; \ +#define vusdotq_lane_s32(__p0_306, __p1_306, __p2_306, __p3_306) __extension__ ({ \ + int32x4_t __s0_306 = __p0_306; \ + uint8x16_t __s1_306 = __p1_306; \ + int8x8_t __s2_306 = __p2_306; \ + int32x4_t __rev0_306; __rev0_306 = __builtin_shufflevector(__s0_306, __s0_306, 3, 2, 1, 0); \ + uint8x16_t __rev1_306; __rev1_306 = __builtin_shufflevector(__s1_306, __s1_306, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x8_t __rev2_306; __rev2_306 = __builtin_shufflevector(__s2_306, __s2_306, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x4_t __ret_306; \ +int8x8_t __reint_306 = __rev2_306; \ + __ret_306 = __noswap_vusdotq_s32(__rev0_306, __rev1_306, (int8x16_t)(__noswap_splatq_lane_s32(*(int32x2_t *) &__reint_306, __p3_306))); \ + __ret_306 = __builtin_shufflevector(__ret_306, __ret_306, 3, 2, 1, 0); \ + __ret_306; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vusdot_lane_s32(__p0_215, __p1_215, __p2_215, __p3_215) __extension__ ({ \ - int32x2_t __s0_215 = __p0_215; \ - uint8x8_t __s1_215 = __p1_215; \ - int8x8_t __s2_215 = __p2_215; \ - int32x2_t __ret_215; \ -int8x8_t __reint_215 = __s2_215; \ - __ret_215 = vusdot_s32(__s0_215, __s1_215, (int8x8_t)(splat_lane_s32(*(int32x2_t *) &__reint_215, __p3_215))); \ - __ret_215; \ +#define vusdot_lane_s32(__p0_307, __p1_307, __p2_307, __p3_307) __extension__ ({ \ + int32x2_t __s0_307 = __p0_307; \ + uint8x8_t __s1_307 = __p1_307; \ + int8x8_t __s2_307 = __p2_307; \ + int32x2_t __ret_307; \ +int8x8_t __reint_307 = __s2_307; \ + __ret_307 = vusdot_s32(__s0_307, __s1_307, (int8x8_t)(splat_lane_s32(*(int32x2_t *) &__reint_307, __p3_307))); \ + __ret_307; \ }) #else -#define vusdot_lane_s32(__p0_216, __p1_216, __p2_216, __p3_216) __extension__ ({ \ - int32x2_t __s0_216 = __p0_216; \ - uint8x8_t __s1_216 = __p1_216; \ - int8x8_t __s2_216 = __p2_216; \ - int32x2_t __rev0_216; __rev0_216 = __builtin_shufflevector(__s0_216, __s0_216, 1, 0); \ - uint8x8_t __rev1_216; __rev1_216 = __builtin_shufflevector(__s1_216, __s1_216, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x8_t __rev2_216; __rev2_216 = __builtin_shufflevector(__s2_216, __s2_216, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32x2_t __ret_216; \ -int8x8_t __reint_216 = __rev2_216; \ - __ret_216 = __noswap_vusdot_s32(__rev0_216, __rev1_216, (int8x8_t)(__noswap_splat_lane_s32(*(int32x2_t *) &__reint_216, __p3_216))); \ - __ret_216 = __builtin_shufflevector(__ret_216, __ret_216, 1, 0); \ - __ret_216; \ +#define vusdot_lane_s32(__p0_308, __p1_308, __p2_308, __p3_308) __extension__ ({ \ + int32x2_t __s0_308 = __p0_308; \ + uint8x8_t __s1_308 = 
__p1_308; \ + int8x8_t __s2_308 = __p2_308; \ + int32x2_t __rev0_308; __rev0_308 = __builtin_shufflevector(__s0_308, __s0_308, 1, 0); \ + uint8x8_t __rev1_308; __rev1_308 = __builtin_shufflevector(__s1_308, __s1_308, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x8_t __rev2_308; __rev2_308 = __builtin_shufflevector(__s2_308, __s2_308, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x2_t __ret_308; \ +int8x8_t __reint_308 = __rev2_308; \ + __ret_308 = __noswap_vusdot_s32(__rev0_308, __rev1_308, (int8x8_t)(__noswap_splat_lane_s32(*(int32x2_t *) &__reint_308, __p3_308))); \ + __ret_308 = __builtin_shufflevector(__ret_308, __ret_308, 1, 0); \ + __ret_308; \ }) #endif @@ -44480,98 +46216,98 @@ __ai int16x4_t vqrdmlah_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) { #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlahq_lane_s32(__p0_217, __p1_217, __p2_217, __p3_217) __extension__ ({ \ - int32x4_t __s0_217 = __p0_217; \ - int32x4_t __s1_217 = __p1_217; \ - int32x2_t __s2_217 = __p2_217; \ - int32x4_t __ret_217; \ - __ret_217 = vqaddq_s32(__s0_217, vqrdmulhq_s32(__s1_217, splatq_lane_s32(__s2_217, __p3_217))); \ - __ret_217; \ +#define vqrdmlahq_lane_s32(__p0_309, __p1_309, __p2_309, __p3_309) __extension__ ({ \ + int32x4_t __s0_309 = __p0_309; \ + int32x4_t __s1_309 = __p1_309; \ + int32x2_t __s2_309 = __p2_309; \ + int32x4_t __ret_309; \ + __ret_309 = vqaddq_s32(__s0_309, vqrdmulhq_s32(__s1_309, splatq_lane_s32(__s2_309, __p3_309))); \ + __ret_309; \ }) #else -#define vqrdmlahq_lane_s32(__p0_218, __p1_218, __p2_218, __p3_218) __extension__ ({ \ - int32x4_t __s0_218 = __p0_218; \ - int32x4_t __s1_218 = __p1_218; \ - int32x2_t __s2_218 = __p2_218; \ - int32x4_t __rev0_218; __rev0_218 = __builtin_shufflevector(__s0_218, __s0_218, 3, 2, 1, 0); \ - int32x4_t __rev1_218; __rev1_218 = __builtin_shufflevector(__s1_218, __s1_218, 3, 2, 1, 0); \ - int32x2_t __rev2_218; __rev2_218 = __builtin_shufflevector(__s2_218, __s2_218, 1, 0); \ - int32x4_t __ret_218; \ - __ret_218 = __noswap_vqaddq_s32(__rev0_218, __noswap_vqrdmulhq_s32(__rev1_218, __noswap_splatq_lane_s32(__rev2_218, __p3_218))); \ - __ret_218 = __builtin_shufflevector(__ret_218, __ret_218, 3, 2, 1, 0); \ - __ret_218; \ +#define vqrdmlahq_lane_s32(__p0_310, __p1_310, __p2_310, __p3_310) __extension__ ({ \ + int32x4_t __s0_310 = __p0_310; \ + int32x4_t __s1_310 = __p1_310; \ + int32x2_t __s2_310 = __p2_310; \ + int32x4_t __rev0_310; __rev0_310 = __builtin_shufflevector(__s0_310, __s0_310, 3, 2, 1, 0); \ + int32x4_t __rev1_310; __rev1_310 = __builtin_shufflevector(__s1_310, __s1_310, 3, 2, 1, 0); \ + int32x2_t __rev2_310; __rev2_310 = __builtin_shufflevector(__s2_310, __s2_310, 1, 0); \ + int32x4_t __ret_310; \ + __ret_310 = __noswap_vqaddq_s32(__rev0_310, __noswap_vqrdmulhq_s32(__rev1_310, __noswap_splatq_lane_s32(__rev2_310, __p3_310))); \ + __ret_310 = __builtin_shufflevector(__ret_310, __ret_310, 3, 2, 1, 0); \ + __ret_310; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlahq_lane_s16(__p0_219, __p1_219, __p2_219, __p3_219) __extension__ ({ \ - int16x8_t __s0_219 = __p0_219; \ - int16x8_t __s1_219 = __p1_219; \ - int16x4_t __s2_219 = __p2_219; \ - int16x8_t __ret_219; \ - __ret_219 = vqaddq_s16(__s0_219, vqrdmulhq_s16(__s1_219, splatq_lane_s16(__s2_219, __p3_219))); \ - __ret_219; \ +#define vqrdmlahq_lane_s16(__p0_311, __p1_311, __p2_311, __p3_311) __extension__ ({ \ + int16x8_t __s0_311 = __p0_311; \ + int16x8_t __s1_311 = __p1_311; \ + int16x4_t __s2_311 = __p2_311; \ + int16x8_t __ret_311; \ + __ret_311 = vqaddq_s16(__s0_311, vqrdmulhq_s16(__s1_311, 
splatq_lane_s16(__s2_311, __p3_311))); \ + __ret_311; \ }) #else -#define vqrdmlahq_lane_s16(__p0_220, __p1_220, __p2_220, __p3_220) __extension__ ({ \ - int16x8_t __s0_220 = __p0_220; \ - int16x8_t __s1_220 = __p1_220; \ - int16x4_t __s2_220 = __p2_220; \ - int16x8_t __rev0_220; __rev0_220 = __builtin_shufflevector(__s0_220, __s0_220, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev1_220; __rev1_220 = __builtin_shufflevector(__s1_220, __s1_220, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x4_t __rev2_220; __rev2_220 = __builtin_shufflevector(__s2_220, __s2_220, 3, 2, 1, 0); \ - int16x8_t __ret_220; \ - __ret_220 = __noswap_vqaddq_s16(__rev0_220, __noswap_vqrdmulhq_s16(__rev1_220, __noswap_splatq_lane_s16(__rev2_220, __p3_220))); \ - __ret_220 = __builtin_shufflevector(__ret_220, __ret_220, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_220; \ +#define vqrdmlahq_lane_s16(__p0_312, __p1_312, __p2_312, __p3_312) __extension__ ({ \ + int16x8_t __s0_312 = __p0_312; \ + int16x8_t __s1_312 = __p1_312; \ + int16x4_t __s2_312 = __p2_312; \ + int16x8_t __rev0_312; __rev0_312 = __builtin_shufflevector(__s0_312, __s0_312, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev1_312; __rev1_312 = __builtin_shufflevector(__s1_312, __s1_312, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x4_t __rev2_312; __rev2_312 = __builtin_shufflevector(__s2_312, __s2_312, 3, 2, 1, 0); \ + int16x8_t __ret_312; \ + __ret_312 = __noswap_vqaddq_s16(__rev0_312, __noswap_vqrdmulhq_s16(__rev1_312, __noswap_splatq_lane_s16(__rev2_312, __p3_312))); \ + __ret_312 = __builtin_shufflevector(__ret_312, __ret_312, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_312; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlah_lane_s32(__p0_221, __p1_221, __p2_221, __p3_221) __extension__ ({ \ - int32x2_t __s0_221 = __p0_221; \ - int32x2_t __s1_221 = __p1_221; \ - int32x2_t __s2_221 = __p2_221; \ - int32x2_t __ret_221; \ - __ret_221 = vqadd_s32(__s0_221, vqrdmulh_s32(__s1_221, splat_lane_s32(__s2_221, __p3_221))); \ - __ret_221; \ +#define vqrdmlah_lane_s32(__p0_313, __p1_313, __p2_313, __p3_313) __extension__ ({ \ + int32x2_t __s0_313 = __p0_313; \ + int32x2_t __s1_313 = __p1_313; \ + int32x2_t __s2_313 = __p2_313; \ + int32x2_t __ret_313; \ + __ret_313 = vqadd_s32(__s0_313, vqrdmulh_s32(__s1_313, splat_lane_s32(__s2_313, __p3_313))); \ + __ret_313; \ }) #else -#define vqrdmlah_lane_s32(__p0_222, __p1_222, __p2_222, __p3_222) __extension__ ({ \ - int32x2_t __s0_222 = __p0_222; \ - int32x2_t __s1_222 = __p1_222; \ - int32x2_t __s2_222 = __p2_222; \ - int32x2_t __rev0_222; __rev0_222 = __builtin_shufflevector(__s0_222, __s0_222, 1, 0); \ - int32x2_t __rev1_222; __rev1_222 = __builtin_shufflevector(__s1_222, __s1_222, 1, 0); \ - int32x2_t __rev2_222; __rev2_222 = __builtin_shufflevector(__s2_222, __s2_222, 1, 0); \ - int32x2_t __ret_222; \ - __ret_222 = __noswap_vqadd_s32(__rev0_222, __noswap_vqrdmulh_s32(__rev1_222, __noswap_splat_lane_s32(__rev2_222, __p3_222))); \ - __ret_222 = __builtin_shufflevector(__ret_222, __ret_222, 1, 0); \ - __ret_222; \ +#define vqrdmlah_lane_s32(__p0_314, __p1_314, __p2_314, __p3_314) __extension__ ({ \ + int32x2_t __s0_314 = __p0_314; \ + int32x2_t __s1_314 = __p1_314; \ + int32x2_t __s2_314 = __p2_314; \ + int32x2_t __rev0_314; __rev0_314 = __builtin_shufflevector(__s0_314, __s0_314, 1, 0); \ + int32x2_t __rev1_314; __rev1_314 = __builtin_shufflevector(__s1_314, __s1_314, 1, 0); \ + int32x2_t __rev2_314; __rev2_314 = __builtin_shufflevector(__s2_314, __s2_314, 1, 0); \ + int32x2_t __ret_314; \ + __ret_314 = __noswap_vqadd_s32(__rev0_314, 
__noswap_vqrdmulh_s32(__rev1_314, __noswap_splat_lane_s32(__rev2_314, __p3_314))); \ + __ret_314 = __builtin_shufflevector(__ret_314, __ret_314, 1, 0); \ + __ret_314; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlah_lane_s16(__p0_223, __p1_223, __p2_223, __p3_223) __extension__ ({ \ - int16x4_t __s0_223 = __p0_223; \ - int16x4_t __s1_223 = __p1_223; \ - int16x4_t __s2_223 = __p2_223; \ - int16x4_t __ret_223; \ - __ret_223 = vqadd_s16(__s0_223, vqrdmulh_s16(__s1_223, splat_lane_s16(__s2_223, __p3_223))); \ - __ret_223; \ +#define vqrdmlah_lane_s16(__p0_315, __p1_315, __p2_315, __p3_315) __extension__ ({ \ + int16x4_t __s0_315 = __p0_315; \ + int16x4_t __s1_315 = __p1_315; \ + int16x4_t __s2_315 = __p2_315; \ + int16x4_t __ret_315; \ + __ret_315 = vqadd_s16(__s0_315, vqrdmulh_s16(__s1_315, splat_lane_s16(__s2_315, __p3_315))); \ + __ret_315; \ }) #else -#define vqrdmlah_lane_s16(__p0_224, __p1_224, __p2_224, __p3_224) __extension__ ({ \ - int16x4_t __s0_224 = __p0_224; \ - int16x4_t __s1_224 = __p1_224; \ - int16x4_t __s2_224 = __p2_224; \ - int16x4_t __rev0_224; __rev0_224 = __builtin_shufflevector(__s0_224, __s0_224, 3, 2, 1, 0); \ - int16x4_t __rev1_224; __rev1_224 = __builtin_shufflevector(__s1_224, __s1_224, 3, 2, 1, 0); \ - int16x4_t __rev2_224; __rev2_224 = __builtin_shufflevector(__s2_224, __s2_224, 3, 2, 1, 0); \ - int16x4_t __ret_224; \ - __ret_224 = __noswap_vqadd_s16(__rev0_224, __noswap_vqrdmulh_s16(__rev1_224, __noswap_splat_lane_s16(__rev2_224, __p3_224))); \ - __ret_224 = __builtin_shufflevector(__ret_224, __ret_224, 3, 2, 1, 0); \ - __ret_224; \ +#define vqrdmlah_lane_s16(__p0_316, __p1_316, __p2_316, __p3_316) __extension__ ({ \ + int16x4_t __s0_316 = __p0_316; \ + int16x4_t __s1_316 = __p1_316; \ + int16x4_t __s2_316 = __p2_316; \ + int16x4_t __rev0_316; __rev0_316 = __builtin_shufflevector(__s0_316, __s0_316, 3, 2, 1, 0); \ + int16x4_t __rev1_316; __rev1_316 = __builtin_shufflevector(__s1_316, __s1_316, 3, 2, 1, 0); \ + int16x4_t __rev2_316; __rev2_316 = __builtin_shufflevector(__s2_316, __s2_316, 3, 2, 1, 0); \ + int16x4_t __ret_316; \ + __ret_316 = __noswap_vqadd_s16(__rev0_316, __noswap_vqrdmulh_s16(__rev1_316, __noswap_splat_lane_s16(__rev2_316, __p3_316))); \ + __ret_316 = __builtin_shufflevector(__ret_316, __ret_316, 3, 2, 1, 0); \ + __ret_316; \ }) #endif @@ -44648,292 +46384,292 @@ __ai int16x4_t vqrdmlsh_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) { #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlshq_lane_s32(__p0_225, __p1_225, __p2_225, __p3_225) __extension__ ({ \ - int32x4_t __s0_225 = __p0_225; \ - int32x4_t __s1_225 = __p1_225; \ - int32x2_t __s2_225 = __p2_225; \ - int32x4_t __ret_225; \ - __ret_225 = vqsubq_s32(__s0_225, vqrdmulhq_s32(__s1_225, splatq_lane_s32(__s2_225, __p3_225))); \ - __ret_225; \ +#define vqrdmlshq_lane_s32(__p0_317, __p1_317, __p2_317, __p3_317) __extension__ ({ \ + int32x4_t __s0_317 = __p0_317; \ + int32x4_t __s1_317 = __p1_317; \ + int32x2_t __s2_317 = __p2_317; \ + int32x4_t __ret_317; \ + __ret_317 = vqsubq_s32(__s0_317, vqrdmulhq_s32(__s1_317, splatq_lane_s32(__s2_317, __p3_317))); \ + __ret_317; \ }) #else -#define vqrdmlshq_lane_s32(__p0_226, __p1_226, __p2_226, __p3_226) __extension__ ({ \ - int32x4_t __s0_226 = __p0_226; \ - int32x4_t __s1_226 = __p1_226; \ - int32x2_t __s2_226 = __p2_226; \ - int32x4_t __rev0_226; __rev0_226 = __builtin_shufflevector(__s0_226, __s0_226, 3, 2, 1, 0); \ - int32x4_t __rev1_226; __rev1_226 = __builtin_shufflevector(__s1_226, __s1_226, 3, 2, 1, 0); \ - int32x2_t __rev2_226; 
__rev2_226 = __builtin_shufflevector(__s2_226, __s2_226, 1, 0); \ - int32x4_t __ret_226; \ - __ret_226 = __noswap_vqsubq_s32(__rev0_226, __noswap_vqrdmulhq_s32(__rev1_226, __noswap_splatq_lane_s32(__rev2_226, __p3_226))); \ - __ret_226 = __builtin_shufflevector(__ret_226, __ret_226, 3, 2, 1, 0); \ - __ret_226; \ +#define vqrdmlshq_lane_s32(__p0_318, __p1_318, __p2_318, __p3_318) __extension__ ({ \ + int32x4_t __s0_318 = __p0_318; \ + int32x4_t __s1_318 = __p1_318; \ + int32x2_t __s2_318 = __p2_318; \ + int32x4_t __rev0_318; __rev0_318 = __builtin_shufflevector(__s0_318, __s0_318, 3, 2, 1, 0); \ + int32x4_t __rev1_318; __rev1_318 = __builtin_shufflevector(__s1_318, __s1_318, 3, 2, 1, 0); \ + int32x2_t __rev2_318; __rev2_318 = __builtin_shufflevector(__s2_318, __s2_318, 1, 0); \ + int32x4_t __ret_318; \ + __ret_318 = __noswap_vqsubq_s32(__rev0_318, __noswap_vqrdmulhq_s32(__rev1_318, __noswap_splatq_lane_s32(__rev2_318, __p3_318))); \ + __ret_318 = __builtin_shufflevector(__ret_318, __ret_318, 3, 2, 1, 0); \ + __ret_318; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlshq_lane_s16(__p0_227, __p1_227, __p2_227, __p3_227) __extension__ ({ \ - int16x8_t __s0_227 = __p0_227; \ - int16x8_t __s1_227 = __p1_227; \ - int16x4_t __s2_227 = __p2_227; \ - int16x8_t __ret_227; \ - __ret_227 = vqsubq_s16(__s0_227, vqrdmulhq_s16(__s1_227, splatq_lane_s16(__s2_227, __p3_227))); \ - __ret_227; \ +#define vqrdmlshq_lane_s16(__p0_319, __p1_319, __p2_319, __p3_319) __extension__ ({ \ + int16x8_t __s0_319 = __p0_319; \ + int16x8_t __s1_319 = __p1_319; \ + int16x4_t __s2_319 = __p2_319; \ + int16x8_t __ret_319; \ + __ret_319 = vqsubq_s16(__s0_319, vqrdmulhq_s16(__s1_319, splatq_lane_s16(__s2_319, __p3_319))); \ + __ret_319; \ }) #else -#define vqrdmlshq_lane_s16(__p0_228, __p1_228, __p2_228, __p3_228) __extension__ ({ \ - int16x8_t __s0_228 = __p0_228; \ - int16x8_t __s1_228 = __p1_228; \ - int16x4_t __s2_228 = __p2_228; \ - int16x8_t __rev0_228; __rev0_228 = __builtin_shufflevector(__s0_228, __s0_228, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev1_228; __rev1_228 = __builtin_shufflevector(__s1_228, __s1_228, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x4_t __rev2_228; __rev2_228 = __builtin_shufflevector(__s2_228, __s2_228, 3, 2, 1, 0); \ - int16x8_t __ret_228; \ - __ret_228 = __noswap_vqsubq_s16(__rev0_228, __noswap_vqrdmulhq_s16(__rev1_228, __noswap_splatq_lane_s16(__rev2_228, __p3_228))); \ - __ret_228 = __builtin_shufflevector(__ret_228, __ret_228, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_228; \ +#define vqrdmlshq_lane_s16(__p0_320, __p1_320, __p2_320, __p3_320) __extension__ ({ \ + int16x8_t __s0_320 = __p0_320; \ + int16x8_t __s1_320 = __p1_320; \ + int16x4_t __s2_320 = __p2_320; \ + int16x8_t __rev0_320; __rev0_320 = __builtin_shufflevector(__s0_320, __s0_320, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev1_320; __rev1_320 = __builtin_shufflevector(__s1_320, __s1_320, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x4_t __rev2_320; __rev2_320 = __builtin_shufflevector(__s2_320, __s2_320, 3, 2, 1, 0); \ + int16x8_t __ret_320; \ + __ret_320 = __noswap_vqsubq_s16(__rev0_320, __noswap_vqrdmulhq_s16(__rev1_320, __noswap_splatq_lane_s16(__rev2_320, __p3_320))); \ + __ret_320 = __builtin_shufflevector(__ret_320, __ret_320, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_320; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlsh_lane_s32(__p0_229, __p1_229, __p2_229, __p3_229) __extension__ ({ \ - int32x2_t __s0_229 = __p0_229; \ - int32x2_t __s1_229 = __p1_229; \ - int32x2_t __s2_229 = __p2_229; \ - int32x2_t __ret_229; \ - __ret_229 = 
vqsub_s32(__s0_229, vqrdmulh_s32(__s1_229, splat_lane_s32(__s2_229, __p3_229))); \ - __ret_229; \ +#define vqrdmlsh_lane_s32(__p0_321, __p1_321, __p2_321, __p3_321) __extension__ ({ \ + int32x2_t __s0_321 = __p0_321; \ + int32x2_t __s1_321 = __p1_321; \ + int32x2_t __s2_321 = __p2_321; \ + int32x2_t __ret_321; \ + __ret_321 = vqsub_s32(__s0_321, vqrdmulh_s32(__s1_321, splat_lane_s32(__s2_321, __p3_321))); \ + __ret_321; \ }) #else -#define vqrdmlsh_lane_s32(__p0_230, __p1_230, __p2_230, __p3_230) __extension__ ({ \ - int32x2_t __s0_230 = __p0_230; \ - int32x2_t __s1_230 = __p1_230; \ - int32x2_t __s2_230 = __p2_230; \ - int32x2_t __rev0_230; __rev0_230 = __builtin_shufflevector(__s0_230, __s0_230, 1, 0); \ - int32x2_t __rev1_230; __rev1_230 = __builtin_shufflevector(__s1_230, __s1_230, 1, 0); \ - int32x2_t __rev2_230; __rev2_230 = __builtin_shufflevector(__s2_230, __s2_230, 1, 0); \ - int32x2_t __ret_230; \ - __ret_230 = __noswap_vqsub_s32(__rev0_230, __noswap_vqrdmulh_s32(__rev1_230, __noswap_splat_lane_s32(__rev2_230, __p3_230))); \ - __ret_230 = __builtin_shufflevector(__ret_230, __ret_230, 1, 0); \ - __ret_230; \ +#define vqrdmlsh_lane_s32(__p0_322, __p1_322, __p2_322, __p3_322) __extension__ ({ \ + int32x2_t __s0_322 = __p0_322; \ + int32x2_t __s1_322 = __p1_322; \ + int32x2_t __s2_322 = __p2_322; \ + int32x2_t __rev0_322; __rev0_322 = __builtin_shufflevector(__s0_322, __s0_322, 1, 0); \ + int32x2_t __rev1_322; __rev1_322 = __builtin_shufflevector(__s1_322, __s1_322, 1, 0); \ + int32x2_t __rev2_322; __rev2_322 = __builtin_shufflevector(__s2_322, __s2_322, 1, 0); \ + int32x2_t __ret_322; \ + __ret_322 = __noswap_vqsub_s32(__rev0_322, __noswap_vqrdmulh_s32(__rev1_322, __noswap_splat_lane_s32(__rev2_322, __p3_322))); \ + __ret_322 = __builtin_shufflevector(__ret_322, __ret_322, 1, 0); \ + __ret_322; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlsh_lane_s16(__p0_231, __p1_231, __p2_231, __p3_231) __extension__ ({ \ - int16x4_t __s0_231 = __p0_231; \ - int16x4_t __s1_231 = __p1_231; \ - int16x4_t __s2_231 = __p2_231; \ - int16x4_t __ret_231; \ - __ret_231 = vqsub_s16(__s0_231, vqrdmulh_s16(__s1_231, splat_lane_s16(__s2_231, __p3_231))); \ - __ret_231; \ +#define vqrdmlsh_lane_s16(__p0_323, __p1_323, __p2_323, __p3_323) __extension__ ({ \ + int16x4_t __s0_323 = __p0_323; \ + int16x4_t __s1_323 = __p1_323; \ + int16x4_t __s2_323 = __p2_323; \ + int16x4_t __ret_323; \ + __ret_323 = vqsub_s16(__s0_323, vqrdmulh_s16(__s1_323, splat_lane_s16(__s2_323, __p3_323))); \ + __ret_323; \ }) #else -#define vqrdmlsh_lane_s16(__p0_232, __p1_232, __p2_232, __p3_232) __extension__ ({ \ - int16x4_t __s0_232 = __p0_232; \ - int16x4_t __s1_232 = __p1_232; \ - int16x4_t __s2_232 = __p2_232; \ - int16x4_t __rev0_232; __rev0_232 = __builtin_shufflevector(__s0_232, __s0_232, 3, 2, 1, 0); \ - int16x4_t __rev1_232; __rev1_232 = __builtin_shufflevector(__s1_232, __s1_232, 3, 2, 1, 0); \ - int16x4_t __rev2_232; __rev2_232 = __builtin_shufflevector(__s2_232, __s2_232, 3, 2, 1, 0); \ - int16x4_t __ret_232; \ - __ret_232 = __noswap_vqsub_s16(__rev0_232, __noswap_vqrdmulh_s16(__rev1_232, __noswap_splat_lane_s16(__rev2_232, __p3_232))); \ - __ret_232 = __builtin_shufflevector(__ret_232, __ret_232, 3, 2, 1, 0); \ - __ret_232; \ +#define vqrdmlsh_lane_s16(__p0_324, __p1_324, __p2_324, __p3_324) __extension__ ({ \ + int16x4_t __s0_324 = __p0_324; \ + int16x4_t __s1_324 = __p1_324; \ + int16x4_t __s2_324 = __p2_324; \ + int16x4_t __rev0_324; __rev0_324 = __builtin_shufflevector(__s0_324, __s0_324, 3, 2, 1, 0); \ + 
int16x4_t __rev1_324; __rev1_324 = __builtin_shufflevector(__s1_324, __s1_324, 3, 2, 1, 0); \ + int16x4_t __rev2_324; __rev2_324 = __builtin_shufflevector(__s2_324, __s2_324, 3, 2, 1, 0); \ + int16x4_t __ret_324; \ + __ret_324 = __noswap_vqsub_s16(__rev0_324, __noswap_vqrdmulh_s16(__rev1_324, __noswap_splat_lane_s16(__rev2_324, __p3_324))); \ + __ret_324 = __builtin_shufflevector(__ret_324, __ret_324, 3, 2, 1, 0); \ + __ret_324; \ }) #endif #endif #if defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__) #ifdef __LITTLE_ENDIAN__ -#define vqrdmlahq_laneq_s32(__p0_233, __p1_233, __p2_233, __p3_233) __extension__ ({ \ - int32x4_t __s0_233 = __p0_233; \ - int32x4_t __s1_233 = __p1_233; \ - int32x4_t __s2_233 = __p2_233; \ - int32x4_t __ret_233; \ - __ret_233 = vqaddq_s32(__s0_233, vqrdmulhq_s32(__s1_233, splatq_laneq_s32(__s2_233, __p3_233))); \ - __ret_233; \ +#define vqrdmlahq_laneq_s32(__p0_325, __p1_325, __p2_325, __p3_325) __extension__ ({ \ + int32x4_t __s0_325 = __p0_325; \ + int32x4_t __s1_325 = __p1_325; \ + int32x4_t __s2_325 = __p2_325; \ + int32x4_t __ret_325; \ + __ret_325 = vqaddq_s32(__s0_325, vqrdmulhq_s32(__s1_325, splatq_laneq_s32(__s2_325, __p3_325))); \ + __ret_325; \ }) #else -#define vqrdmlahq_laneq_s32(__p0_234, __p1_234, __p2_234, __p3_234) __extension__ ({ \ - int32x4_t __s0_234 = __p0_234; \ - int32x4_t __s1_234 = __p1_234; \ - int32x4_t __s2_234 = __p2_234; \ - int32x4_t __rev0_234; __rev0_234 = __builtin_shufflevector(__s0_234, __s0_234, 3, 2, 1, 0); \ - int32x4_t __rev1_234; __rev1_234 = __builtin_shufflevector(__s1_234, __s1_234, 3, 2, 1, 0); \ - int32x4_t __rev2_234; __rev2_234 = __builtin_shufflevector(__s2_234, __s2_234, 3, 2, 1, 0); \ - int32x4_t __ret_234; \ - __ret_234 = __noswap_vqaddq_s32(__rev0_234, __noswap_vqrdmulhq_s32(__rev1_234, __noswap_splatq_laneq_s32(__rev2_234, __p3_234))); \ - __ret_234 = __builtin_shufflevector(__ret_234, __ret_234, 3, 2, 1, 0); \ - __ret_234; \ +#define vqrdmlahq_laneq_s32(__p0_326, __p1_326, __p2_326, __p3_326) __extension__ ({ \ + int32x4_t __s0_326 = __p0_326; \ + int32x4_t __s1_326 = __p1_326; \ + int32x4_t __s2_326 = __p2_326; \ + int32x4_t __rev0_326; __rev0_326 = __builtin_shufflevector(__s0_326, __s0_326, 3, 2, 1, 0); \ + int32x4_t __rev1_326; __rev1_326 = __builtin_shufflevector(__s1_326, __s1_326, 3, 2, 1, 0); \ + int32x4_t __rev2_326; __rev2_326 = __builtin_shufflevector(__s2_326, __s2_326, 3, 2, 1, 0); \ + int32x4_t __ret_326; \ + __ret_326 = __noswap_vqaddq_s32(__rev0_326, __noswap_vqrdmulhq_s32(__rev1_326, __noswap_splatq_laneq_s32(__rev2_326, __p3_326))); \ + __ret_326 = __builtin_shufflevector(__ret_326, __ret_326, 3, 2, 1, 0); \ + __ret_326; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlahq_laneq_s16(__p0_235, __p1_235, __p2_235, __p3_235) __extension__ ({ \ - int16x8_t __s0_235 = __p0_235; \ - int16x8_t __s1_235 = __p1_235; \ - int16x8_t __s2_235 = __p2_235; \ - int16x8_t __ret_235; \ - __ret_235 = vqaddq_s16(__s0_235, vqrdmulhq_s16(__s1_235, splatq_laneq_s16(__s2_235, __p3_235))); \ - __ret_235; \ +#define vqrdmlahq_laneq_s16(__p0_327, __p1_327, __p2_327, __p3_327) __extension__ ({ \ + int16x8_t __s0_327 = __p0_327; \ + int16x8_t __s1_327 = __p1_327; \ + int16x8_t __s2_327 = __p2_327; \ + int16x8_t __ret_327; \ + __ret_327 = vqaddq_s16(__s0_327, vqrdmulhq_s16(__s1_327, splatq_laneq_s16(__s2_327, __p3_327))); \ + __ret_327; \ }) #else -#define vqrdmlahq_laneq_s16(__p0_236, __p1_236, __p2_236, __p3_236) __extension__ ({ \ - int16x8_t __s0_236 = __p0_236; \ - int16x8_t __s1_236 = __p1_236; \ - int16x8_t 
__s2_236 = __p2_236; \ - int16x8_t __rev0_236; __rev0_236 = __builtin_shufflevector(__s0_236, __s0_236, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev1_236; __rev1_236 = __builtin_shufflevector(__s1_236, __s1_236, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev2_236; __rev2_236 = __builtin_shufflevector(__s2_236, __s2_236, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __ret_236; \ - __ret_236 = __noswap_vqaddq_s16(__rev0_236, __noswap_vqrdmulhq_s16(__rev1_236, __noswap_splatq_laneq_s16(__rev2_236, __p3_236))); \ - __ret_236 = __builtin_shufflevector(__ret_236, __ret_236, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_236; \ +#define vqrdmlahq_laneq_s16(__p0_328, __p1_328, __p2_328, __p3_328) __extension__ ({ \ + int16x8_t __s0_328 = __p0_328; \ + int16x8_t __s1_328 = __p1_328; \ + int16x8_t __s2_328 = __p2_328; \ + int16x8_t __rev0_328; __rev0_328 = __builtin_shufflevector(__s0_328, __s0_328, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev1_328; __rev1_328 = __builtin_shufflevector(__s1_328, __s1_328, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev2_328; __rev2_328 = __builtin_shufflevector(__s2_328, __s2_328, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __ret_328; \ + __ret_328 = __noswap_vqaddq_s16(__rev0_328, __noswap_vqrdmulhq_s16(__rev1_328, __noswap_splatq_laneq_s16(__rev2_328, __p3_328))); \ + __ret_328 = __builtin_shufflevector(__ret_328, __ret_328, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_328; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlah_laneq_s32(__p0_237, __p1_237, __p2_237, __p3_237) __extension__ ({ \ - int32x2_t __s0_237 = __p0_237; \ - int32x2_t __s1_237 = __p1_237; \ - int32x4_t __s2_237 = __p2_237; \ - int32x2_t __ret_237; \ - __ret_237 = vqadd_s32(__s0_237, vqrdmulh_s32(__s1_237, splat_laneq_s32(__s2_237, __p3_237))); \ - __ret_237; \ +#define vqrdmlah_laneq_s32(__p0_329, __p1_329, __p2_329, __p3_329) __extension__ ({ \ + int32x2_t __s0_329 = __p0_329; \ + int32x2_t __s1_329 = __p1_329; \ + int32x4_t __s2_329 = __p2_329; \ + int32x2_t __ret_329; \ + __ret_329 = vqadd_s32(__s0_329, vqrdmulh_s32(__s1_329, splat_laneq_s32(__s2_329, __p3_329))); \ + __ret_329; \ }) #else -#define vqrdmlah_laneq_s32(__p0_238, __p1_238, __p2_238, __p3_238) __extension__ ({ \ - int32x2_t __s0_238 = __p0_238; \ - int32x2_t __s1_238 = __p1_238; \ - int32x4_t __s2_238 = __p2_238; \ - int32x2_t __rev0_238; __rev0_238 = __builtin_shufflevector(__s0_238, __s0_238, 1, 0); \ - int32x2_t __rev1_238; __rev1_238 = __builtin_shufflevector(__s1_238, __s1_238, 1, 0); \ - int32x4_t __rev2_238; __rev2_238 = __builtin_shufflevector(__s2_238, __s2_238, 3, 2, 1, 0); \ - int32x2_t __ret_238; \ - __ret_238 = __noswap_vqadd_s32(__rev0_238, __noswap_vqrdmulh_s32(__rev1_238, __noswap_splat_laneq_s32(__rev2_238, __p3_238))); \ - __ret_238 = __builtin_shufflevector(__ret_238, __ret_238, 1, 0); \ - __ret_238; \ +#define vqrdmlah_laneq_s32(__p0_330, __p1_330, __p2_330, __p3_330) __extension__ ({ \ + int32x2_t __s0_330 = __p0_330; \ + int32x2_t __s1_330 = __p1_330; \ + int32x4_t __s2_330 = __p2_330; \ + int32x2_t __rev0_330; __rev0_330 = __builtin_shufflevector(__s0_330, __s0_330, 1, 0); \ + int32x2_t __rev1_330; __rev1_330 = __builtin_shufflevector(__s1_330, __s1_330, 1, 0); \ + int32x4_t __rev2_330; __rev2_330 = __builtin_shufflevector(__s2_330, __s2_330, 3, 2, 1, 0); \ + int32x2_t __ret_330; \ + __ret_330 = __noswap_vqadd_s32(__rev0_330, __noswap_vqrdmulh_s32(__rev1_330, __noswap_splat_laneq_s32(__rev2_330, __p3_330))); \ + __ret_330 = __builtin_shufflevector(__ret_330, __ret_330, 1, 0); \ + __ret_330; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define 
vqrdmlah_laneq_s16(__p0_239, __p1_239, __p2_239, __p3_239) __extension__ ({ \ - int16x4_t __s0_239 = __p0_239; \ - int16x4_t __s1_239 = __p1_239; \ - int16x8_t __s2_239 = __p2_239; \ - int16x4_t __ret_239; \ - __ret_239 = vqadd_s16(__s0_239, vqrdmulh_s16(__s1_239, splat_laneq_s16(__s2_239, __p3_239))); \ - __ret_239; \ +#define vqrdmlah_laneq_s16(__p0_331, __p1_331, __p2_331, __p3_331) __extension__ ({ \ + int16x4_t __s0_331 = __p0_331; \ + int16x4_t __s1_331 = __p1_331; \ + int16x8_t __s2_331 = __p2_331; \ + int16x4_t __ret_331; \ + __ret_331 = vqadd_s16(__s0_331, vqrdmulh_s16(__s1_331, splat_laneq_s16(__s2_331, __p3_331))); \ + __ret_331; \ }) #else -#define vqrdmlah_laneq_s16(__p0_240, __p1_240, __p2_240, __p3_240) __extension__ ({ \ - int16x4_t __s0_240 = __p0_240; \ - int16x4_t __s1_240 = __p1_240; \ - int16x8_t __s2_240 = __p2_240; \ - int16x4_t __rev0_240; __rev0_240 = __builtin_shufflevector(__s0_240, __s0_240, 3, 2, 1, 0); \ - int16x4_t __rev1_240; __rev1_240 = __builtin_shufflevector(__s1_240, __s1_240, 3, 2, 1, 0); \ - int16x8_t __rev2_240; __rev2_240 = __builtin_shufflevector(__s2_240, __s2_240, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x4_t __ret_240; \ - __ret_240 = __noswap_vqadd_s16(__rev0_240, __noswap_vqrdmulh_s16(__rev1_240, __noswap_splat_laneq_s16(__rev2_240, __p3_240))); \ - __ret_240 = __builtin_shufflevector(__ret_240, __ret_240, 3, 2, 1, 0); \ - __ret_240; \ +#define vqrdmlah_laneq_s16(__p0_332, __p1_332, __p2_332, __p3_332) __extension__ ({ \ + int16x4_t __s0_332 = __p0_332; \ + int16x4_t __s1_332 = __p1_332; \ + int16x8_t __s2_332 = __p2_332; \ + int16x4_t __rev0_332; __rev0_332 = __builtin_shufflevector(__s0_332, __s0_332, 3, 2, 1, 0); \ + int16x4_t __rev1_332; __rev1_332 = __builtin_shufflevector(__s1_332, __s1_332, 3, 2, 1, 0); \ + int16x8_t __rev2_332; __rev2_332 = __builtin_shufflevector(__s2_332, __s2_332, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x4_t __ret_332; \ + __ret_332 = __noswap_vqadd_s16(__rev0_332, __noswap_vqrdmulh_s16(__rev1_332, __noswap_splat_laneq_s16(__rev2_332, __p3_332))); \ + __ret_332 = __builtin_shufflevector(__ret_332, __ret_332, 3, 2, 1, 0); \ + __ret_332; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlshq_laneq_s32(__p0_241, __p1_241, __p2_241, __p3_241) __extension__ ({ \ - int32x4_t __s0_241 = __p0_241; \ - int32x4_t __s1_241 = __p1_241; \ - int32x4_t __s2_241 = __p2_241; \ - int32x4_t __ret_241; \ - __ret_241 = vqsubq_s32(__s0_241, vqrdmulhq_s32(__s1_241, splatq_laneq_s32(__s2_241, __p3_241))); \ - __ret_241; \ +#define vqrdmlshq_laneq_s32(__p0_333, __p1_333, __p2_333, __p3_333) __extension__ ({ \ + int32x4_t __s0_333 = __p0_333; \ + int32x4_t __s1_333 = __p1_333; \ + int32x4_t __s2_333 = __p2_333; \ + int32x4_t __ret_333; \ + __ret_333 = vqsubq_s32(__s0_333, vqrdmulhq_s32(__s1_333, splatq_laneq_s32(__s2_333, __p3_333))); \ + __ret_333; \ }) #else -#define vqrdmlshq_laneq_s32(__p0_242, __p1_242, __p2_242, __p3_242) __extension__ ({ \ - int32x4_t __s0_242 = __p0_242; \ - int32x4_t __s1_242 = __p1_242; \ - int32x4_t __s2_242 = __p2_242; \ - int32x4_t __rev0_242; __rev0_242 = __builtin_shufflevector(__s0_242, __s0_242, 3, 2, 1, 0); \ - int32x4_t __rev1_242; __rev1_242 = __builtin_shufflevector(__s1_242, __s1_242, 3, 2, 1, 0); \ - int32x4_t __rev2_242; __rev2_242 = __builtin_shufflevector(__s2_242, __s2_242, 3, 2, 1, 0); \ - int32x4_t __ret_242; \ - __ret_242 = __noswap_vqsubq_s32(__rev0_242, __noswap_vqrdmulhq_s32(__rev1_242, __noswap_splatq_laneq_s32(__rev2_242, __p3_242))); \ - __ret_242 = __builtin_shufflevector(__ret_242, __ret_242, 3, 2, 
1, 0); \ - __ret_242; \ +#define vqrdmlshq_laneq_s32(__p0_334, __p1_334, __p2_334, __p3_334) __extension__ ({ \ + int32x4_t __s0_334 = __p0_334; \ + int32x4_t __s1_334 = __p1_334; \ + int32x4_t __s2_334 = __p2_334; \ + int32x4_t __rev0_334; __rev0_334 = __builtin_shufflevector(__s0_334, __s0_334, 3, 2, 1, 0); \ + int32x4_t __rev1_334; __rev1_334 = __builtin_shufflevector(__s1_334, __s1_334, 3, 2, 1, 0); \ + int32x4_t __rev2_334; __rev2_334 = __builtin_shufflevector(__s2_334, __s2_334, 3, 2, 1, 0); \ + int32x4_t __ret_334; \ + __ret_334 = __noswap_vqsubq_s32(__rev0_334, __noswap_vqrdmulhq_s32(__rev1_334, __noswap_splatq_laneq_s32(__rev2_334, __p3_334))); \ + __ret_334 = __builtin_shufflevector(__ret_334, __ret_334, 3, 2, 1, 0); \ + __ret_334; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlshq_laneq_s16(__p0_243, __p1_243, __p2_243, __p3_243) __extension__ ({ \ - int16x8_t __s0_243 = __p0_243; \ - int16x8_t __s1_243 = __p1_243; \ - int16x8_t __s2_243 = __p2_243; \ - int16x8_t __ret_243; \ - __ret_243 = vqsubq_s16(__s0_243, vqrdmulhq_s16(__s1_243, splatq_laneq_s16(__s2_243, __p3_243))); \ - __ret_243; \ +#define vqrdmlshq_laneq_s16(__p0_335, __p1_335, __p2_335, __p3_335) __extension__ ({ \ + int16x8_t __s0_335 = __p0_335; \ + int16x8_t __s1_335 = __p1_335; \ + int16x8_t __s2_335 = __p2_335; \ + int16x8_t __ret_335; \ + __ret_335 = vqsubq_s16(__s0_335, vqrdmulhq_s16(__s1_335, splatq_laneq_s16(__s2_335, __p3_335))); \ + __ret_335; \ }) #else -#define vqrdmlshq_laneq_s16(__p0_244, __p1_244, __p2_244, __p3_244) __extension__ ({ \ - int16x8_t __s0_244 = __p0_244; \ - int16x8_t __s1_244 = __p1_244; \ - int16x8_t __s2_244 = __p2_244; \ - int16x8_t __rev0_244; __rev0_244 = __builtin_shufflevector(__s0_244, __s0_244, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev1_244; __rev1_244 = __builtin_shufflevector(__s1_244, __s1_244, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev2_244; __rev2_244 = __builtin_shufflevector(__s2_244, __s2_244, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __ret_244; \ - __ret_244 = __noswap_vqsubq_s16(__rev0_244, __noswap_vqrdmulhq_s16(__rev1_244, __noswap_splatq_laneq_s16(__rev2_244, __p3_244))); \ - __ret_244 = __builtin_shufflevector(__ret_244, __ret_244, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_244; \ +#define vqrdmlshq_laneq_s16(__p0_336, __p1_336, __p2_336, __p3_336) __extension__ ({ \ + int16x8_t __s0_336 = __p0_336; \ + int16x8_t __s1_336 = __p1_336; \ + int16x8_t __s2_336 = __p2_336; \ + int16x8_t __rev0_336; __rev0_336 = __builtin_shufflevector(__s0_336, __s0_336, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev1_336; __rev1_336 = __builtin_shufflevector(__s1_336, __s1_336, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev2_336; __rev2_336 = __builtin_shufflevector(__s2_336, __s2_336, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __ret_336; \ + __ret_336 = __noswap_vqsubq_s16(__rev0_336, __noswap_vqrdmulhq_s16(__rev1_336, __noswap_splatq_laneq_s16(__rev2_336, __p3_336))); \ + __ret_336 = __builtin_shufflevector(__ret_336, __ret_336, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_336; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlsh_laneq_s32(__p0_245, __p1_245, __p2_245, __p3_245) __extension__ ({ \ - int32x2_t __s0_245 = __p0_245; \ - int32x2_t __s1_245 = __p1_245; \ - int32x4_t __s2_245 = __p2_245; \ - int32x2_t __ret_245; \ - __ret_245 = vqsub_s32(__s0_245, vqrdmulh_s32(__s1_245, splat_laneq_s32(__s2_245, __p3_245))); \ - __ret_245; \ +#define vqrdmlsh_laneq_s32(__p0_337, __p1_337, __p2_337, __p3_337) __extension__ ({ \ + int32x2_t __s0_337 = __p0_337; \ + int32x2_t __s1_337 = __p1_337; \ + 
int32x4_t __s2_337 = __p2_337; \ + int32x2_t __ret_337; \ + __ret_337 = vqsub_s32(__s0_337, vqrdmulh_s32(__s1_337, splat_laneq_s32(__s2_337, __p3_337))); \ + __ret_337; \ }) #else -#define vqrdmlsh_laneq_s32(__p0_246, __p1_246, __p2_246, __p3_246) __extension__ ({ \ - int32x2_t __s0_246 = __p0_246; \ - int32x2_t __s1_246 = __p1_246; \ - int32x4_t __s2_246 = __p2_246; \ - int32x2_t __rev0_246; __rev0_246 = __builtin_shufflevector(__s0_246, __s0_246, 1, 0); \ - int32x2_t __rev1_246; __rev1_246 = __builtin_shufflevector(__s1_246, __s1_246, 1, 0); \ - int32x4_t __rev2_246; __rev2_246 = __builtin_shufflevector(__s2_246, __s2_246, 3, 2, 1, 0); \ - int32x2_t __ret_246; \ - __ret_246 = __noswap_vqsub_s32(__rev0_246, __noswap_vqrdmulh_s32(__rev1_246, __noswap_splat_laneq_s32(__rev2_246, __p3_246))); \ - __ret_246 = __builtin_shufflevector(__ret_246, __ret_246, 1, 0); \ - __ret_246; \ +#define vqrdmlsh_laneq_s32(__p0_338, __p1_338, __p2_338, __p3_338) __extension__ ({ \ + int32x2_t __s0_338 = __p0_338; \ + int32x2_t __s1_338 = __p1_338; \ + int32x4_t __s2_338 = __p2_338; \ + int32x2_t __rev0_338; __rev0_338 = __builtin_shufflevector(__s0_338, __s0_338, 1, 0); \ + int32x2_t __rev1_338; __rev1_338 = __builtin_shufflevector(__s1_338, __s1_338, 1, 0); \ + int32x4_t __rev2_338; __rev2_338 = __builtin_shufflevector(__s2_338, __s2_338, 3, 2, 1, 0); \ + int32x2_t __ret_338; \ + __ret_338 = __noswap_vqsub_s32(__rev0_338, __noswap_vqrdmulh_s32(__rev1_338, __noswap_splat_laneq_s32(__rev2_338, __p3_338))); \ + __ret_338 = __builtin_shufflevector(__ret_338, __ret_338, 1, 0); \ + __ret_338; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlsh_laneq_s16(__p0_247, __p1_247, __p2_247, __p3_247) __extension__ ({ \ - int16x4_t __s0_247 = __p0_247; \ - int16x4_t __s1_247 = __p1_247; \ - int16x8_t __s2_247 = __p2_247; \ - int16x4_t __ret_247; \ - __ret_247 = vqsub_s16(__s0_247, vqrdmulh_s16(__s1_247, splat_laneq_s16(__s2_247, __p3_247))); \ - __ret_247; \ +#define vqrdmlsh_laneq_s16(__p0_339, __p1_339, __p2_339, __p3_339) __extension__ ({ \ + int16x4_t __s0_339 = __p0_339; \ + int16x4_t __s1_339 = __p1_339; \ + int16x8_t __s2_339 = __p2_339; \ + int16x4_t __ret_339; \ + __ret_339 = vqsub_s16(__s0_339, vqrdmulh_s16(__s1_339, splat_laneq_s16(__s2_339, __p3_339))); \ + __ret_339; \ }) #else -#define vqrdmlsh_laneq_s16(__p0_248, __p1_248, __p2_248, __p3_248) __extension__ ({ \ - int16x4_t __s0_248 = __p0_248; \ - int16x4_t __s1_248 = __p1_248; \ - int16x8_t __s2_248 = __p2_248; \ - int16x4_t __rev0_248; __rev0_248 = __builtin_shufflevector(__s0_248, __s0_248, 3, 2, 1, 0); \ - int16x4_t __rev1_248; __rev1_248 = __builtin_shufflevector(__s1_248, __s1_248, 3, 2, 1, 0); \ - int16x8_t __rev2_248; __rev2_248 = __builtin_shufflevector(__s2_248, __s2_248, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x4_t __ret_248; \ - __ret_248 = __noswap_vqsub_s16(__rev0_248, __noswap_vqrdmulh_s16(__rev1_248, __noswap_splat_laneq_s16(__rev2_248, __p3_248))); \ - __ret_248 = __builtin_shufflevector(__ret_248, __ret_248, 3, 2, 1, 0); \ - __ret_248; \ +#define vqrdmlsh_laneq_s16(__p0_340, __p1_340, __p2_340, __p3_340) __extension__ ({ \ + int16x4_t __s0_340 = __p0_340; \ + int16x4_t __s1_340 = __p1_340; \ + int16x8_t __s2_340 = __p2_340; \ + int16x4_t __rev0_340; __rev0_340 = __builtin_shufflevector(__s0_340, __s0_340, 3, 2, 1, 0); \ + int16x4_t __rev1_340; __rev1_340 = __builtin_shufflevector(__s1_340, __s1_340, 3, 2, 1, 0); \ + int16x8_t __rev2_340; __rev2_340 = __builtin_shufflevector(__s2_340, __s2_340, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x4_t __ret_340; \ 
+ __ret_340 = __noswap_vqsub_s16(__rev0_340, __noswap_vqrdmulh_s16(__rev1_340, __noswap_splat_laneq_s16(__rev2_340, __p3_340))); \ + __ret_340 = __builtin_shufflevector(__ret_340, __ret_340, 3, 2, 1, 0); \ + __ret_340; \ }) #endif @@ -45860,9 +47596,9 @@ __ai uint64_t vceqd_u64(uint64_t __p0, uint64_t __p1) { __ret = (uint64_t) __builtin_neon_vceqd_u64(__p0, __p1); return __ret; } -__ai int64_t vceqd_s64(int64_t __p0, int64_t __p1) { - int64_t __ret; - __ret = (int64_t) __builtin_neon_vceqd_s64(__p0, __p1); +__ai uint64_t vceqd_s64(int64_t __p0, int64_t __p1) { + uint64_t __ret; + __ret = (uint64_t) __builtin_neon_vceqd_s64(__p0, __p1); return __ret; } __ai uint64_t vceqd_f64(float64_t __p0, float64_t __p1) { @@ -45896,22 +47632,6 @@ __ai uint64x1_t vceqz_p64(poly64x1_t __p0) { __ret = (uint64x1_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 19); return __ret; } -#ifdef __LITTLE_ENDIAN__ -__ai uint16x4_t vceqz_p16(poly16x4_t __p0) { - uint16x4_t __ret; - __ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 17); - return __ret; -} -#else -__ai uint16x4_t vceqz_p16(poly16x4_t __p0) { - poly16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - uint16x4_t __ret; - __ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 17); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} -#endif - #ifdef __LITTLE_ENDIAN__ __ai uint8x16_t vceqzq_p8(poly8x16_t __p0) { uint8x16_t __ret; @@ -45944,22 +47664,6 @@ __ai uint64x2_t vceqzq_p64(poly64x2_t __p0) { } #endif -#ifdef __LITTLE_ENDIAN__ -__ai uint16x8_t vceqzq_p16(poly16x8_t __p0) { - uint16x8_t __ret; - __ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 49); - return __ret; -} -#else -__ai uint16x8_t vceqzq_p16(poly16x8_t __p0) { - poly16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - uint16x8_t __ret; - __ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 49); - __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret; -} -#endif - #ifdef __LITTLE_ENDIAN__ __ai uint8x16_t vceqzq_u8(uint8x16_t __p0) { uint8x16_t __ret; @@ -46252,9 +47956,9 @@ __ai uint64_t vceqzd_u64(uint64_t __p0) { __ret = (uint64_t) __builtin_neon_vceqzd_u64(__p0); return __ret; } -__ai int64_t vceqzd_s64(int64_t __p0) { - int64_t __ret; - __ret = (int64_t) __builtin_neon_vceqzd_s64(__p0); +__ai uint64_t vceqzd_s64(int64_t __p0) { + uint64_t __ret; + __ret = (uint64_t) __builtin_neon_vceqzd_s64(__p0); return __ret; } __ai uint64_t vceqzd_f64(float64_t __p0) { @@ -46333,9 +48037,9 @@ __ai uint64x1_t vcge_s64(int64x1_t __p0, int64x1_t __p1) { __ret = (uint64x1_t)(__p0 >= __p1); return __ret; } -__ai int64_t vcged_s64(int64_t __p0, int64_t __p1) { - int64_t __ret; - __ret = (int64_t) __builtin_neon_vcged_s64(__p0, __p1); +__ai uint64_t vcged_s64(int64_t __p0, int64_t __p1) { + uint64_t __ret; + __ret = (uint64_t) __builtin_neon_vcged_s64(__p0, __p1); return __ret; } __ai uint64_t vcged_u64(uint64_t __p0, uint64_t __p1) { @@ -46523,9 +48227,9 @@ __ai uint16x4_t vcgez_s16(int16x4_t __p0) { } #endif -__ai int64_t vcgezd_s64(int64_t __p0) { - int64_t __ret; - __ret = (int64_t) __builtin_neon_vcgezd_s64(__p0); +__ai uint64_t vcgezd_s64(int64_t __p0) { + uint64_t __ret; + __ret = (uint64_t) __builtin_neon_vcgezd_s64(__p0); return __ret; } __ai uint64_t vcgezd_f64(float64_t __p0) { @@ -46604,9 +48308,9 @@ __ai uint64x1_t vcgt_s64(int64x1_t __p0, int64x1_t __p1) { __ret = (uint64x1_t)(__p0 > __p1); return __ret; } -__ai int64_t vcgtd_s64(int64_t __p0, 
int64_t __p1) { - int64_t __ret; - __ret = (int64_t) __builtin_neon_vcgtd_s64(__p0, __p1); +__ai uint64_t vcgtd_s64(int64_t __p0, int64_t __p1) { + uint64_t __ret; + __ret = (uint64_t) __builtin_neon_vcgtd_s64(__p0, __p1); return __ret; } __ai uint64_t vcgtd_u64(uint64_t __p0, uint64_t __p1) { @@ -46794,9 +48498,9 @@ __ai uint16x4_t vcgtz_s16(int16x4_t __p0) { } #endif -__ai int64_t vcgtzd_s64(int64_t __p0) { - int64_t __ret; - __ret = (int64_t) __builtin_neon_vcgtzd_s64(__p0); +__ai uint64_t vcgtzd_s64(int64_t __p0) { + uint64_t __ret; + __ret = (uint64_t) __builtin_neon_vcgtzd_s64(__p0); return __ret; } __ai uint64_t vcgtzd_f64(float64_t __p0) { @@ -46880,9 +48584,9 @@ __ai uint64_t vcled_u64(uint64_t __p0, uint64_t __p1) { __ret = (uint64_t) __builtin_neon_vcled_u64(__p0, __p1); return __ret; } -__ai int64_t vcled_s64(int64_t __p0, int64_t __p1) { - int64_t __ret; - __ret = (int64_t) __builtin_neon_vcled_s64(__p0, __p1); +__ai uint64_t vcled_s64(int64_t __p0, int64_t __p1) { + uint64_t __ret; + __ret = (uint64_t) __builtin_neon_vcled_s64(__p0, __p1); return __ret; } __ai uint64_t vcled_f64(float64_t __p0, float64_t __p1) { @@ -47065,9 +48769,9 @@ __ai uint16x4_t vclez_s16(int16x4_t __p0) { } #endif -__ai int64_t vclezd_s64(int64_t __p0) { - int64_t __ret; - __ret = (int64_t) __builtin_neon_vclezd_s64(__p0); +__ai uint64_t vclezd_s64(int64_t __p0) { + uint64_t __ret; + __ret = (uint64_t) __builtin_neon_vclezd_s64(__p0); return __ret; } __ai uint64_t vclezd_f64(float64_t __p0) { @@ -47151,9 +48855,9 @@ __ai uint64_t vcltd_u64(uint64_t __p0, uint64_t __p1) { __ret = (uint64_t) __builtin_neon_vcltd_u64(__p0, __p1); return __ret; } -__ai int64_t vcltd_s64(int64_t __p0, int64_t __p1) { - int64_t __ret; - __ret = (int64_t) __builtin_neon_vcltd_s64(__p0, __p1); +__ai uint64_t vcltd_s64(int64_t __p0, int64_t __p1) { + uint64_t __ret; + __ret = (uint64_t) __builtin_neon_vcltd_s64(__p0, __p1); return __ret; } __ai uint64_t vcltd_f64(float64_t __p0, float64_t __p1) { @@ -47336,9 +49040,9 @@ __ai uint16x4_t vcltz_s16(int16x4_t __p0) { } #endif -__ai int64_t vcltzd_s64(int64_t __p0) { - int64_t __ret; - __ret = (int64_t) __builtin_neon_vcltzd_s64(__p0); +__ai uint64_t vcltzd_s64(int64_t __p0) { + uint64_t __ret; + __ret = (uint64_t) __builtin_neon_vcltzd_s64(__p0); return __ret; } __ai uint64_t vcltzd_f64(float64_t __p0) { @@ -47382,892 +49086,892 @@ __ai float64x2_t vcombine_f64(float64x1_t __p0, float64x1_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_p8(__p0_249, __p1_249, __p2_249, __p3_249) __extension__ ({ \ - poly8x16_t __s0_249 = __p0_249; \ - poly8x8_t __s2_249 = __p2_249; \ - poly8x16_t __ret_249; \ - __ret_249 = vsetq_lane_p8(vget_lane_p8(__s2_249, __p3_249), __s0_249, __p1_249); \ - __ret_249; \ +#define vcopyq_lane_p8(__p0_341, __p1_341, __p2_341, __p3_341) __extension__ ({ \ + poly8x16_t __s0_341 = __p0_341; \ + poly8x8_t __s2_341 = __p2_341; \ + poly8x16_t __ret_341; \ + __ret_341 = vsetq_lane_p8(vget_lane_p8(__s2_341, __p3_341), __s0_341, __p1_341); \ + __ret_341; \ }) #else -#define vcopyq_lane_p8(__p0_250, __p1_250, __p2_250, __p3_250) __extension__ ({ \ - poly8x16_t __s0_250 = __p0_250; \ - poly8x8_t __s2_250 = __p2_250; \ - poly8x16_t __rev0_250; __rev0_250 = __builtin_shufflevector(__s0_250, __s0_250, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly8x8_t __rev2_250; __rev2_250 = __builtin_shufflevector(__s2_250, __s2_250, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly8x16_t __ret_250; \ - __ret_250 = __noswap_vsetq_lane_p8(__noswap_vget_lane_p8(__rev2_250, 
__p3_250), __rev0_250, __p1_250); \ - __ret_250 = __builtin_shufflevector(__ret_250, __ret_250, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_250; \ +#define vcopyq_lane_p8(__p0_342, __p1_342, __p2_342, __p3_342) __extension__ ({ \ + poly8x16_t __s0_342 = __p0_342; \ + poly8x8_t __s2_342 = __p2_342; \ + poly8x16_t __rev0_342; __rev0_342 = __builtin_shufflevector(__s0_342, __s0_342, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly8x8_t __rev2_342; __rev2_342 = __builtin_shufflevector(__s2_342, __s2_342, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly8x16_t __ret_342; \ + __ret_342 = __noswap_vsetq_lane_p8(__noswap_vget_lane_p8(__rev2_342, __p3_342), __rev0_342, __p1_342); \ + __ret_342 = __builtin_shufflevector(__ret_342, __ret_342, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_342; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_p16(__p0_251, __p1_251, __p2_251, __p3_251) __extension__ ({ \ - poly16x8_t __s0_251 = __p0_251; \ - poly16x4_t __s2_251 = __p2_251; \ - poly16x8_t __ret_251; \ - __ret_251 = vsetq_lane_p16(vget_lane_p16(__s2_251, __p3_251), __s0_251, __p1_251); \ - __ret_251; \ +#define vcopyq_lane_p16(__p0_343, __p1_343, __p2_343, __p3_343) __extension__ ({ \ + poly16x8_t __s0_343 = __p0_343; \ + poly16x4_t __s2_343 = __p2_343; \ + poly16x8_t __ret_343; \ + __ret_343 = vsetq_lane_p16(vget_lane_p16(__s2_343, __p3_343), __s0_343, __p1_343); \ + __ret_343; \ }) #else -#define vcopyq_lane_p16(__p0_252, __p1_252, __p2_252, __p3_252) __extension__ ({ \ - poly16x8_t __s0_252 = __p0_252; \ - poly16x4_t __s2_252 = __p2_252; \ - poly16x8_t __rev0_252; __rev0_252 = __builtin_shufflevector(__s0_252, __s0_252, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly16x4_t __rev2_252; __rev2_252 = __builtin_shufflevector(__s2_252, __s2_252, 3, 2, 1, 0); \ - poly16x8_t __ret_252; \ - __ret_252 = __noswap_vsetq_lane_p16(__noswap_vget_lane_p16(__rev2_252, __p3_252), __rev0_252, __p1_252); \ - __ret_252 = __builtin_shufflevector(__ret_252, __ret_252, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_252; \ +#define vcopyq_lane_p16(__p0_344, __p1_344, __p2_344, __p3_344) __extension__ ({ \ + poly16x8_t __s0_344 = __p0_344; \ + poly16x4_t __s2_344 = __p2_344; \ + poly16x8_t __rev0_344; __rev0_344 = __builtin_shufflevector(__s0_344, __s0_344, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly16x4_t __rev2_344; __rev2_344 = __builtin_shufflevector(__s2_344, __s2_344, 3, 2, 1, 0); \ + poly16x8_t __ret_344; \ + __ret_344 = __noswap_vsetq_lane_p16(__noswap_vget_lane_p16(__rev2_344, __p3_344), __rev0_344, __p1_344); \ + __ret_344 = __builtin_shufflevector(__ret_344, __ret_344, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_344; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_u8(__p0_253, __p1_253, __p2_253, __p3_253) __extension__ ({ \ - uint8x16_t __s0_253 = __p0_253; \ - uint8x8_t __s2_253 = __p2_253; \ - uint8x16_t __ret_253; \ - __ret_253 = vsetq_lane_u8(vget_lane_u8(__s2_253, __p3_253), __s0_253, __p1_253); \ - __ret_253; \ +#define vcopyq_lane_u8(__p0_345, __p1_345, __p2_345, __p3_345) __extension__ ({ \ + uint8x16_t __s0_345 = __p0_345; \ + uint8x8_t __s2_345 = __p2_345; \ + uint8x16_t __ret_345; \ + __ret_345 = vsetq_lane_u8(vget_lane_u8(__s2_345, __p3_345), __s0_345, __p1_345); \ + __ret_345; \ }) #else -#define vcopyq_lane_u8(__p0_254, __p1_254, __p2_254, __p3_254) __extension__ ({ \ - uint8x16_t __s0_254 = __p0_254; \ - uint8x8_t __s2_254 = __p2_254; \ - uint8x16_t __rev0_254; __rev0_254 = __builtin_shufflevector(__s0_254, __s0_254, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x8_t 
__rev2_254; __rev2_254 = __builtin_shufflevector(__s2_254, __s2_254, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __ret_254; \ - __ret_254 = __noswap_vsetq_lane_u8(__noswap_vget_lane_u8(__rev2_254, __p3_254), __rev0_254, __p1_254); \ - __ret_254 = __builtin_shufflevector(__ret_254, __ret_254, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_254; \ +#define vcopyq_lane_u8(__p0_346, __p1_346, __p2_346, __p3_346) __extension__ ({ \ + uint8x16_t __s0_346 = __p0_346; \ + uint8x8_t __s2_346 = __p2_346; \ + uint8x16_t __rev0_346; __rev0_346 = __builtin_shufflevector(__s0_346, __s0_346, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev2_346; __rev2_346 = __builtin_shufflevector(__s2_346, __s2_346, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __ret_346; \ + __ret_346 = __noswap_vsetq_lane_u8(__noswap_vget_lane_u8(__rev2_346, __p3_346), __rev0_346, __p1_346); \ + __ret_346 = __builtin_shufflevector(__ret_346, __ret_346, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_346; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_u32(__p0_255, __p1_255, __p2_255, __p3_255) __extension__ ({ \ - uint32x4_t __s0_255 = __p0_255; \ - uint32x2_t __s2_255 = __p2_255; \ - uint32x4_t __ret_255; \ - __ret_255 = vsetq_lane_u32(vget_lane_u32(__s2_255, __p3_255), __s0_255, __p1_255); \ - __ret_255; \ +#define vcopyq_lane_u32(__p0_347, __p1_347, __p2_347, __p3_347) __extension__ ({ \ + uint32x4_t __s0_347 = __p0_347; \ + uint32x2_t __s2_347 = __p2_347; \ + uint32x4_t __ret_347; \ + __ret_347 = vsetq_lane_u32(vget_lane_u32(__s2_347, __p3_347), __s0_347, __p1_347); \ + __ret_347; \ }) #else -#define vcopyq_lane_u32(__p0_256, __p1_256, __p2_256, __p3_256) __extension__ ({ \ - uint32x4_t __s0_256 = __p0_256; \ - uint32x2_t __s2_256 = __p2_256; \ - uint32x4_t __rev0_256; __rev0_256 = __builtin_shufflevector(__s0_256, __s0_256, 3, 2, 1, 0); \ - uint32x2_t __rev2_256; __rev2_256 = __builtin_shufflevector(__s2_256, __s2_256, 1, 0); \ - uint32x4_t __ret_256; \ - __ret_256 = __noswap_vsetq_lane_u32(__noswap_vget_lane_u32(__rev2_256, __p3_256), __rev0_256, __p1_256); \ - __ret_256 = __builtin_shufflevector(__ret_256, __ret_256, 3, 2, 1, 0); \ - __ret_256; \ +#define vcopyq_lane_u32(__p0_348, __p1_348, __p2_348, __p3_348) __extension__ ({ \ + uint32x4_t __s0_348 = __p0_348; \ + uint32x2_t __s2_348 = __p2_348; \ + uint32x4_t __rev0_348; __rev0_348 = __builtin_shufflevector(__s0_348, __s0_348, 3, 2, 1, 0); \ + uint32x2_t __rev2_348; __rev2_348 = __builtin_shufflevector(__s2_348, __s2_348, 1, 0); \ + uint32x4_t __ret_348; \ + __ret_348 = __noswap_vsetq_lane_u32(__noswap_vget_lane_u32(__rev2_348, __p3_348), __rev0_348, __p1_348); \ + __ret_348 = __builtin_shufflevector(__ret_348, __ret_348, 3, 2, 1, 0); \ + __ret_348; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_u64(__p0_257, __p1_257, __p2_257, __p3_257) __extension__ ({ \ - uint64x2_t __s0_257 = __p0_257; \ - uint64x1_t __s2_257 = __p2_257; \ - uint64x2_t __ret_257; \ - __ret_257 = vsetq_lane_u64(vget_lane_u64(__s2_257, __p3_257), __s0_257, __p1_257); \ - __ret_257; \ +#define vcopyq_lane_u64(__p0_349, __p1_349, __p2_349, __p3_349) __extension__ ({ \ + uint64x2_t __s0_349 = __p0_349; \ + uint64x1_t __s2_349 = __p2_349; \ + uint64x2_t __ret_349; \ + __ret_349 = vsetq_lane_u64(vget_lane_u64(__s2_349, __p3_349), __s0_349, __p1_349); \ + __ret_349; \ }) #else -#define vcopyq_lane_u64(__p0_258, __p1_258, __p2_258, __p3_258) __extension__ ({ \ - uint64x2_t __s0_258 = __p0_258; \ - uint64x1_t __s2_258 = __p2_258; \ - 
uint64x2_t __rev0_258; __rev0_258 = __builtin_shufflevector(__s0_258, __s0_258, 1, 0); \ - uint64x2_t __ret_258; \ - __ret_258 = __noswap_vsetq_lane_u64(vget_lane_u64(__s2_258, __p3_258), __rev0_258, __p1_258); \ - __ret_258 = __builtin_shufflevector(__ret_258, __ret_258, 1, 0); \ - __ret_258; \ +#define vcopyq_lane_u64(__p0_350, __p1_350, __p2_350, __p3_350) __extension__ ({ \ + uint64x2_t __s0_350 = __p0_350; \ + uint64x1_t __s2_350 = __p2_350; \ + uint64x2_t __rev0_350; __rev0_350 = __builtin_shufflevector(__s0_350, __s0_350, 1, 0); \ + uint64x2_t __ret_350; \ + __ret_350 = __noswap_vsetq_lane_u64(vget_lane_u64(__s2_350, __p3_350), __rev0_350, __p1_350); \ + __ret_350 = __builtin_shufflevector(__ret_350, __ret_350, 1, 0); \ + __ret_350; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_u16(__p0_259, __p1_259, __p2_259, __p3_259) __extension__ ({ \ - uint16x8_t __s0_259 = __p0_259; \ - uint16x4_t __s2_259 = __p2_259; \ - uint16x8_t __ret_259; \ - __ret_259 = vsetq_lane_u16(vget_lane_u16(__s2_259, __p3_259), __s0_259, __p1_259); \ - __ret_259; \ +#define vcopyq_lane_u16(__p0_351, __p1_351, __p2_351, __p3_351) __extension__ ({ \ + uint16x8_t __s0_351 = __p0_351; \ + uint16x4_t __s2_351 = __p2_351; \ + uint16x8_t __ret_351; \ + __ret_351 = vsetq_lane_u16(vget_lane_u16(__s2_351, __p3_351), __s0_351, __p1_351); \ + __ret_351; \ }) #else -#define vcopyq_lane_u16(__p0_260, __p1_260, __p2_260, __p3_260) __extension__ ({ \ - uint16x8_t __s0_260 = __p0_260; \ - uint16x4_t __s2_260 = __p2_260; \ - uint16x8_t __rev0_260; __rev0_260 = __builtin_shufflevector(__s0_260, __s0_260, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x4_t __rev2_260; __rev2_260 = __builtin_shufflevector(__s2_260, __s2_260, 3, 2, 1, 0); \ - uint16x8_t __ret_260; \ - __ret_260 = __noswap_vsetq_lane_u16(__noswap_vget_lane_u16(__rev2_260, __p3_260), __rev0_260, __p1_260); \ - __ret_260 = __builtin_shufflevector(__ret_260, __ret_260, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_260; \ +#define vcopyq_lane_u16(__p0_352, __p1_352, __p2_352, __p3_352) __extension__ ({ \ + uint16x8_t __s0_352 = __p0_352; \ + uint16x4_t __s2_352 = __p2_352; \ + uint16x8_t __rev0_352; __rev0_352 = __builtin_shufflevector(__s0_352, __s0_352, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x4_t __rev2_352; __rev2_352 = __builtin_shufflevector(__s2_352, __s2_352, 3, 2, 1, 0); \ + uint16x8_t __ret_352; \ + __ret_352 = __noswap_vsetq_lane_u16(__noswap_vget_lane_u16(__rev2_352, __p3_352), __rev0_352, __p1_352); \ + __ret_352 = __builtin_shufflevector(__ret_352, __ret_352, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_352; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_s8(__p0_261, __p1_261, __p2_261, __p3_261) __extension__ ({ \ - int8x16_t __s0_261 = __p0_261; \ - int8x8_t __s2_261 = __p2_261; \ - int8x16_t __ret_261; \ - __ret_261 = vsetq_lane_s8(vget_lane_s8(__s2_261, __p3_261), __s0_261, __p1_261); \ - __ret_261; \ +#define vcopyq_lane_s8(__p0_353, __p1_353, __p2_353, __p3_353) __extension__ ({ \ + int8x16_t __s0_353 = __p0_353; \ + int8x8_t __s2_353 = __p2_353; \ + int8x16_t __ret_353; \ + __ret_353 = vsetq_lane_s8(vget_lane_s8(__s2_353, __p3_353), __s0_353, __p1_353); \ + __ret_353; \ }) #else -#define vcopyq_lane_s8(__p0_262, __p1_262, __p2_262, __p3_262) __extension__ ({ \ - int8x16_t __s0_262 = __p0_262; \ - int8x8_t __s2_262 = __p2_262; \ - int8x16_t __rev0_262; __rev0_262 = __builtin_shufflevector(__s0_262, __s0_262, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x8_t __rev2_262; __rev2_262 = __builtin_shufflevector(__s2_262, __s2_262, 7, 6, 5, 4, 3, 2, 1, 0); 
\ - int8x16_t __ret_262; \ - __ret_262 = __noswap_vsetq_lane_s8(__noswap_vget_lane_s8(__rev2_262, __p3_262), __rev0_262, __p1_262); \ - __ret_262 = __builtin_shufflevector(__ret_262, __ret_262, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_262; \ +#define vcopyq_lane_s8(__p0_354, __p1_354, __p2_354, __p3_354) __extension__ ({ \ + int8x16_t __s0_354 = __p0_354; \ + int8x8_t __s2_354 = __p2_354; \ + int8x16_t __rev0_354; __rev0_354 = __builtin_shufflevector(__s0_354, __s0_354, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x8_t __rev2_354; __rev2_354 = __builtin_shufflevector(__s2_354, __s2_354, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __ret_354; \ + __ret_354 = __noswap_vsetq_lane_s8(__noswap_vget_lane_s8(__rev2_354, __p3_354), __rev0_354, __p1_354); \ + __ret_354 = __builtin_shufflevector(__ret_354, __ret_354, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_354; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_f32(__p0_263, __p1_263, __p2_263, __p3_263) __extension__ ({ \ - float32x4_t __s0_263 = __p0_263; \ - float32x2_t __s2_263 = __p2_263; \ - float32x4_t __ret_263; \ - __ret_263 = vsetq_lane_f32(vget_lane_f32(__s2_263, __p3_263), __s0_263, __p1_263); \ - __ret_263; \ +#define vcopyq_lane_f32(__p0_355, __p1_355, __p2_355, __p3_355) __extension__ ({ \ + float32x4_t __s0_355 = __p0_355; \ + float32x2_t __s2_355 = __p2_355; \ + float32x4_t __ret_355; \ + __ret_355 = vsetq_lane_f32(vget_lane_f32(__s2_355, __p3_355), __s0_355, __p1_355); \ + __ret_355; \ }) #else -#define vcopyq_lane_f32(__p0_264, __p1_264, __p2_264, __p3_264) __extension__ ({ \ - float32x4_t __s0_264 = __p0_264; \ - float32x2_t __s2_264 = __p2_264; \ - float32x4_t __rev0_264; __rev0_264 = __builtin_shufflevector(__s0_264, __s0_264, 3, 2, 1, 0); \ - float32x2_t __rev2_264; __rev2_264 = __builtin_shufflevector(__s2_264, __s2_264, 1, 0); \ - float32x4_t __ret_264; \ - __ret_264 = __noswap_vsetq_lane_f32(__noswap_vget_lane_f32(__rev2_264, __p3_264), __rev0_264, __p1_264); \ - __ret_264 = __builtin_shufflevector(__ret_264, __ret_264, 3, 2, 1, 0); \ - __ret_264; \ +#define vcopyq_lane_f32(__p0_356, __p1_356, __p2_356, __p3_356) __extension__ ({ \ + float32x4_t __s0_356 = __p0_356; \ + float32x2_t __s2_356 = __p2_356; \ + float32x4_t __rev0_356; __rev0_356 = __builtin_shufflevector(__s0_356, __s0_356, 3, 2, 1, 0); \ + float32x2_t __rev2_356; __rev2_356 = __builtin_shufflevector(__s2_356, __s2_356, 1, 0); \ + float32x4_t __ret_356; \ + __ret_356 = __noswap_vsetq_lane_f32(__noswap_vget_lane_f32(__rev2_356, __p3_356), __rev0_356, __p1_356); \ + __ret_356 = __builtin_shufflevector(__ret_356, __ret_356, 3, 2, 1, 0); \ + __ret_356; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_s32(__p0_265, __p1_265, __p2_265, __p3_265) __extension__ ({ \ - int32x4_t __s0_265 = __p0_265; \ - int32x2_t __s2_265 = __p2_265; \ - int32x4_t __ret_265; \ - __ret_265 = vsetq_lane_s32(vget_lane_s32(__s2_265, __p3_265), __s0_265, __p1_265); \ - __ret_265; \ +#define vcopyq_lane_s32(__p0_357, __p1_357, __p2_357, __p3_357) __extension__ ({ \ + int32x4_t __s0_357 = __p0_357; \ + int32x2_t __s2_357 = __p2_357; \ + int32x4_t __ret_357; \ + __ret_357 = vsetq_lane_s32(vget_lane_s32(__s2_357, __p3_357), __s0_357, __p1_357); \ + __ret_357; \ }) #else -#define vcopyq_lane_s32(__p0_266, __p1_266, __p2_266, __p3_266) __extension__ ({ \ - int32x4_t __s0_266 = __p0_266; \ - int32x2_t __s2_266 = __p2_266; \ - int32x4_t __rev0_266; __rev0_266 = __builtin_shufflevector(__s0_266, __s0_266, 3, 2, 1, 0); \ - 
int32x2_t __rev2_266; __rev2_266 = __builtin_shufflevector(__s2_266, __s2_266, 1, 0); \ - int32x4_t __ret_266; \ - __ret_266 = __noswap_vsetq_lane_s32(__noswap_vget_lane_s32(__rev2_266, __p3_266), __rev0_266, __p1_266); \ - __ret_266 = __builtin_shufflevector(__ret_266, __ret_266, 3, 2, 1, 0); \ - __ret_266; \ +#define vcopyq_lane_s32(__p0_358, __p1_358, __p2_358, __p3_358) __extension__ ({ \ + int32x4_t __s0_358 = __p0_358; \ + int32x2_t __s2_358 = __p2_358; \ + int32x4_t __rev0_358; __rev0_358 = __builtin_shufflevector(__s0_358, __s0_358, 3, 2, 1, 0); \ + int32x2_t __rev2_358; __rev2_358 = __builtin_shufflevector(__s2_358, __s2_358, 1, 0); \ + int32x4_t __ret_358; \ + __ret_358 = __noswap_vsetq_lane_s32(__noswap_vget_lane_s32(__rev2_358, __p3_358), __rev0_358, __p1_358); \ + __ret_358 = __builtin_shufflevector(__ret_358, __ret_358, 3, 2, 1, 0); \ + __ret_358; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_s64(__p0_267, __p1_267, __p2_267, __p3_267) __extension__ ({ \ - int64x2_t __s0_267 = __p0_267; \ - int64x1_t __s2_267 = __p2_267; \ - int64x2_t __ret_267; \ - __ret_267 = vsetq_lane_s64(vget_lane_s64(__s2_267, __p3_267), __s0_267, __p1_267); \ - __ret_267; \ +#define vcopyq_lane_s64(__p0_359, __p1_359, __p2_359, __p3_359) __extension__ ({ \ + int64x2_t __s0_359 = __p0_359; \ + int64x1_t __s2_359 = __p2_359; \ + int64x2_t __ret_359; \ + __ret_359 = vsetq_lane_s64(vget_lane_s64(__s2_359, __p3_359), __s0_359, __p1_359); \ + __ret_359; \ }) #else -#define vcopyq_lane_s64(__p0_268, __p1_268, __p2_268, __p3_268) __extension__ ({ \ - int64x2_t __s0_268 = __p0_268; \ - int64x1_t __s2_268 = __p2_268; \ - int64x2_t __rev0_268; __rev0_268 = __builtin_shufflevector(__s0_268, __s0_268, 1, 0); \ - int64x2_t __ret_268; \ - __ret_268 = __noswap_vsetq_lane_s64(vget_lane_s64(__s2_268, __p3_268), __rev0_268, __p1_268); \ - __ret_268 = __builtin_shufflevector(__ret_268, __ret_268, 1, 0); \ - __ret_268; \ +#define vcopyq_lane_s64(__p0_360, __p1_360, __p2_360, __p3_360) __extension__ ({ \ + int64x2_t __s0_360 = __p0_360; \ + int64x1_t __s2_360 = __p2_360; \ + int64x2_t __rev0_360; __rev0_360 = __builtin_shufflevector(__s0_360, __s0_360, 1, 0); \ + int64x2_t __ret_360; \ + __ret_360 = __noswap_vsetq_lane_s64(vget_lane_s64(__s2_360, __p3_360), __rev0_360, __p1_360); \ + __ret_360 = __builtin_shufflevector(__ret_360, __ret_360, 1, 0); \ + __ret_360; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_s16(__p0_269, __p1_269, __p2_269, __p3_269) __extension__ ({ \ - int16x8_t __s0_269 = __p0_269; \ - int16x4_t __s2_269 = __p2_269; \ - int16x8_t __ret_269; \ - __ret_269 = vsetq_lane_s16(vget_lane_s16(__s2_269, __p3_269), __s0_269, __p1_269); \ - __ret_269; \ +#define vcopyq_lane_s16(__p0_361, __p1_361, __p2_361, __p3_361) __extension__ ({ \ + int16x8_t __s0_361 = __p0_361; \ + int16x4_t __s2_361 = __p2_361; \ + int16x8_t __ret_361; \ + __ret_361 = vsetq_lane_s16(vget_lane_s16(__s2_361, __p3_361), __s0_361, __p1_361); \ + __ret_361; \ }) #else -#define vcopyq_lane_s16(__p0_270, __p1_270, __p2_270, __p3_270) __extension__ ({ \ - int16x8_t __s0_270 = __p0_270; \ - int16x4_t __s2_270 = __p2_270; \ - int16x8_t __rev0_270; __rev0_270 = __builtin_shufflevector(__s0_270, __s0_270, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x4_t __rev2_270; __rev2_270 = __builtin_shufflevector(__s2_270, __s2_270, 3, 2, 1, 0); \ - int16x8_t __ret_270; \ - __ret_270 = __noswap_vsetq_lane_s16(__noswap_vget_lane_s16(__rev2_270, __p3_270), __rev0_270, __p1_270); \ - __ret_270 = __builtin_shufflevector(__ret_270, __ret_270, 7, 6, 5, 4, 
3, 2, 1, 0); \ - __ret_270; \ +#define vcopyq_lane_s16(__p0_362, __p1_362, __p2_362, __p3_362) __extension__ ({ \ + int16x8_t __s0_362 = __p0_362; \ + int16x4_t __s2_362 = __p2_362; \ + int16x8_t __rev0_362; __rev0_362 = __builtin_shufflevector(__s0_362, __s0_362, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x4_t __rev2_362; __rev2_362 = __builtin_shufflevector(__s2_362, __s2_362, 3, 2, 1, 0); \ + int16x8_t __ret_362; \ + __ret_362 = __noswap_vsetq_lane_s16(__noswap_vget_lane_s16(__rev2_362, __p3_362), __rev0_362, __p1_362); \ + __ret_362 = __builtin_shufflevector(__ret_362, __ret_362, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_362; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_p8(__p0_271, __p1_271, __p2_271, __p3_271) __extension__ ({ \ - poly8x8_t __s0_271 = __p0_271; \ - poly8x8_t __s2_271 = __p2_271; \ - poly8x8_t __ret_271; \ - __ret_271 = vset_lane_p8(vget_lane_p8(__s2_271, __p3_271), __s0_271, __p1_271); \ - __ret_271; \ +#define vcopy_lane_p8(__p0_363, __p1_363, __p2_363, __p3_363) __extension__ ({ \ + poly8x8_t __s0_363 = __p0_363; \ + poly8x8_t __s2_363 = __p2_363; \ + poly8x8_t __ret_363; \ + __ret_363 = vset_lane_p8(vget_lane_p8(__s2_363, __p3_363), __s0_363, __p1_363); \ + __ret_363; \ }) #else -#define vcopy_lane_p8(__p0_272, __p1_272, __p2_272, __p3_272) __extension__ ({ \ - poly8x8_t __s0_272 = __p0_272; \ - poly8x8_t __s2_272 = __p2_272; \ - poly8x8_t __rev0_272; __rev0_272 = __builtin_shufflevector(__s0_272, __s0_272, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly8x8_t __rev2_272; __rev2_272 = __builtin_shufflevector(__s2_272, __s2_272, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly8x8_t __ret_272; \ - __ret_272 = __noswap_vset_lane_p8(__noswap_vget_lane_p8(__rev2_272, __p3_272), __rev0_272, __p1_272); \ - __ret_272 = __builtin_shufflevector(__ret_272, __ret_272, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_272; \ +#define vcopy_lane_p8(__p0_364, __p1_364, __p2_364, __p3_364) __extension__ ({ \ + poly8x8_t __s0_364 = __p0_364; \ + poly8x8_t __s2_364 = __p2_364; \ + poly8x8_t __rev0_364; __rev0_364 = __builtin_shufflevector(__s0_364, __s0_364, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly8x8_t __rev2_364; __rev2_364 = __builtin_shufflevector(__s2_364, __s2_364, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly8x8_t __ret_364; \ + __ret_364 = __noswap_vset_lane_p8(__noswap_vget_lane_p8(__rev2_364, __p3_364), __rev0_364, __p1_364); \ + __ret_364 = __builtin_shufflevector(__ret_364, __ret_364, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_364; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_p16(__p0_273, __p1_273, __p2_273, __p3_273) __extension__ ({ \ - poly16x4_t __s0_273 = __p0_273; \ - poly16x4_t __s2_273 = __p2_273; \ - poly16x4_t __ret_273; \ - __ret_273 = vset_lane_p16(vget_lane_p16(__s2_273, __p3_273), __s0_273, __p1_273); \ - __ret_273; \ +#define vcopy_lane_p16(__p0_365, __p1_365, __p2_365, __p3_365) __extension__ ({ \ + poly16x4_t __s0_365 = __p0_365; \ + poly16x4_t __s2_365 = __p2_365; \ + poly16x4_t __ret_365; \ + __ret_365 = vset_lane_p16(vget_lane_p16(__s2_365, __p3_365), __s0_365, __p1_365); \ + __ret_365; \ }) #else -#define vcopy_lane_p16(__p0_274, __p1_274, __p2_274, __p3_274) __extension__ ({ \ - poly16x4_t __s0_274 = __p0_274; \ - poly16x4_t __s2_274 = __p2_274; \ - poly16x4_t __rev0_274; __rev0_274 = __builtin_shufflevector(__s0_274, __s0_274, 3, 2, 1, 0); \ - poly16x4_t __rev2_274; __rev2_274 = __builtin_shufflevector(__s2_274, __s2_274, 3, 2, 1, 0); \ - poly16x4_t __ret_274; \ - __ret_274 = __noswap_vset_lane_p16(__noswap_vget_lane_p16(__rev2_274, __p3_274), __rev0_274, __p1_274); \ - __ret_274 = 
__builtin_shufflevector(__ret_274, __ret_274, 3, 2, 1, 0); \ - __ret_274; \ +#define vcopy_lane_p16(__p0_366, __p1_366, __p2_366, __p3_366) __extension__ ({ \ + poly16x4_t __s0_366 = __p0_366; \ + poly16x4_t __s2_366 = __p2_366; \ + poly16x4_t __rev0_366; __rev0_366 = __builtin_shufflevector(__s0_366, __s0_366, 3, 2, 1, 0); \ + poly16x4_t __rev2_366; __rev2_366 = __builtin_shufflevector(__s2_366, __s2_366, 3, 2, 1, 0); \ + poly16x4_t __ret_366; \ + __ret_366 = __noswap_vset_lane_p16(__noswap_vget_lane_p16(__rev2_366, __p3_366), __rev0_366, __p1_366); \ + __ret_366 = __builtin_shufflevector(__ret_366, __ret_366, 3, 2, 1, 0); \ + __ret_366; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_u8(__p0_275, __p1_275, __p2_275, __p3_275) __extension__ ({ \ - uint8x8_t __s0_275 = __p0_275; \ - uint8x8_t __s2_275 = __p2_275; \ - uint8x8_t __ret_275; \ - __ret_275 = vset_lane_u8(vget_lane_u8(__s2_275, __p3_275), __s0_275, __p1_275); \ - __ret_275; \ +#define vcopy_lane_u8(__p0_367, __p1_367, __p2_367, __p3_367) __extension__ ({ \ + uint8x8_t __s0_367 = __p0_367; \ + uint8x8_t __s2_367 = __p2_367; \ + uint8x8_t __ret_367; \ + __ret_367 = vset_lane_u8(vget_lane_u8(__s2_367, __p3_367), __s0_367, __p1_367); \ + __ret_367; \ }) #else -#define vcopy_lane_u8(__p0_276, __p1_276, __p2_276, __p3_276) __extension__ ({ \ - uint8x8_t __s0_276 = __p0_276; \ - uint8x8_t __s2_276 = __p2_276; \ - uint8x8_t __rev0_276; __rev0_276 = __builtin_shufflevector(__s0_276, __s0_276, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x8_t __rev2_276; __rev2_276 = __builtin_shufflevector(__s2_276, __s2_276, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x8_t __ret_276; \ - __ret_276 = __noswap_vset_lane_u8(__noswap_vget_lane_u8(__rev2_276, __p3_276), __rev0_276, __p1_276); \ - __ret_276 = __builtin_shufflevector(__ret_276, __ret_276, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_276; \ +#define vcopy_lane_u8(__p0_368, __p1_368, __p2_368, __p3_368) __extension__ ({ \ + uint8x8_t __s0_368 = __p0_368; \ + uint8x8_t __s2_368 = __p2_368; \ + uint8x8_t __rev0_368; __rev0_368 = __builtin_shufflevector(__s0_368, __s0_368, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev2_368; __rev2_368 = __builtin_shufflevector(__s2_368, __s2_368, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __ret_368; \ + __ret_368 = __noswap_vset_lane_u8(__noswap_vget_lane_u8(__rev2_368, __p3_368), __rev0_368, __p1_368); \ + __ret_368 = __builtin_shufflevector(__ret_368, __ret_368, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_368; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_u32(__p0_277, __p1_277, __p2_277, __p3_277) __extension__ ({ \ - uint32x2_t __s0_277 = __p0_277; \ - uint32x2_t __s2_277 = __p2_277; \ - uint32x2_t __ret_277; \ - __ret_277 = vset_lane_u32(vget_lane_u32(__s2_277, __p3_277), __s0_277, __p1_277); \ - __ret_277; \ +#define vcopy_lane_u32(__p0_369, __p1_369, __p2_369, __p3_369) __extension__ ({ \ + uint32x2_t __s0_369 = __p0_369; \ + uint32x2_t __s2_369 = __p2_369; \ + uint32x2_t __ret_369; \ + __ret_369 = vset_lane_u32(vget_lane_u32(__s2_369, __p3_369), __s0_369, __p1_369); \ + __ret_369; \ }) #else -#define vcopy_lane_u32(__p0_278, __p1_278, __p2_278, __p3_278) __extension__ ({ \ - uint32x2_t __s0_278 = __p0_278; \ - uint32x2_t __s2_278 = __p2_278; \ - uint32x2_t __rev0_278; __rev0_278 = __builtin_shufflevector(__s0_278, __s0_278, 1, 0); \ - uint32x2_t __rev2_278; __rev2_278 = __builtin_shufflevector(__s2_278, __s2_278, 1, 0); \ - uint32x2_t __ret_278; \ - __ret_278 = __noswap_vset_lane_u32(__noswap_vget_lane_u32(__rev2_278, __p3_278), __rev0_278, __p1_278); \ - __ret_278 = 
__builtin_shufflevector(__ret_278, __ret_278, 1, 0); \ - __ret_278; \ +#define vcopy_lane_u32(__p0_370, __p1_370, __p2_370, __p3_370) __extension__ ({ \ + uint32x2_t __s0_370 = __p0_370; \ + uint32x2_t __s2_370 = __p2_370; \ + uint32x2_t __rev0_370; __rev0_370 = __builtin_shufflevector(__s0_370, __s0_370, 1, 0); \ + uint32x2_t __rev2_370; __rev2_370 = __builtin_shufflevector(__s2_370, __s2_370, 1, 0); \ + uint32x2_t __ret_370; \ + __ret_370 = __noswap_vset_lane_u32(__noswap_vget_lane_u32(__rev2_370, __p3_370), __rev0_370, __p1_370); \ + __ret_370 = __builtin_shufflevector(__ret_370, __ret_370, 1, 0); \ + __ret_370; \ }) #endif -#define vcopy_lane_u64(__p0_279, __p1_279, __p2_279, __p3_279) __extension__ ({ \ - uint64x1_t __s0_279 = __p0_279; \ - uint64x1_t __s2_279 = __p2_279; \ - uint64x1_t __ret_279; \ - __ret_279 = vset_lane_u64(vget_lane_u64(__s2_279, __p3_279), __s0_279, __p1_279); \ - __ret_279; \ +#define vcopy_lane_u64(__p0_371, __p1_371, __p2_371, __p3_371) __extension__ ({ \ + uint64x1_t __s0_371 = __p0_371; \ + uint64x1_t __s2_371 = __p2_371; \ + uint64x1_t __ret_371; \ + __ret_371 = vset_lane_u64(vget_lane_u64(__s2_371, __p3_371), __s0_371, __p1_371); \ + __ret_371; \ }) #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_u16(__p0_280, __p1_280, __p2_280, __p3_280) __extension__ ({ \ - uint16x4_t __s0_280 = __p0_280; \ - uint16x4_t __s2_280 = __p2_280; \ - uint16x4_t __ret_280; \ - __ret_280 = vset_lane_u16(vget_lane_u16(__s2_280, __p3_280), __s0_280, __p1_280); \ - __ret_280; \ +#define vcopy_lane_u16(__p0_372, __p1_372, __p2_372, __p3_372) __extension__ ({ \ + uint16x4_t __s0_372 = __p0_372; \ + uint16x4_t __s2_372 = __p2_372; \ + uint16x4_t __ret_372; \ + __ret_372 = vset_lane_u16(vget_lane_u16(__s2_372, __p3_372), __s0_372, __p1_372); \ + __ret_372; \ }) #else -#define vcopy_lane_u16(__p0_281, __p1_281, __p2_281, __p3_281) __extension__ ({ \ - uint16x4_t __s0_281 = __p0_281; \ - uint16x4_t __s2_281 = __p2_281; \ - uint16x4_t __rev0_281; __rev0_281 = __builtin_shufflevector(__s0_281, __s0_281, 3, 2, 1, 0); \ - uint16x4_t __rev2_281; __rev2_281 = __builtin_shufflevector(__s2_281, __s2_281, 3, 2, 1, 0); \ - uint16x4_t __ret_281; \ - __ret_281 = __noswap_vset_lane_u16(__noswap_vget_lane_u16(__rev2_281, __p3_281), __rev0_281, __p1_281); \ - __ret_281 = __builtin_shufflevector(__ret_281, __ret_281, 3, 2, 1, 0); \ - __ret_281; \ +#define vcopy_lane_u16(__p0_373, __p1_373, __p2_373, __p3_373) __extension__ ({ \ + uint16x4_t __s0_373 = __p0_373; \ + uint16x4_t __s2_373 = __p2_373; \ + uint16x4_t __rev0_373; __rev0_373 = __builtin_shufflevector(__s0_373, __s0_373, 3, 2, 1, 0); \ + uint16x4_t __rev2_373; __rev2_373 = __builtin_shufflevector(__s2_373, __s2_373, 3, 2, 1, 0); \ + uint16x4_t __ret_373; \ + __ret_373 = __noswap_vset_lane_u16(__noswap_vget_lane_u16(__rev2_373, __p3_373), __rev0_373, __p1_373); \ + __ret_373 = __builtin_shufflevector(__ret_373, __ret_373, 3, 2, 1, 0); \ + __ret_373; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_s8(__p0_282, __p1_282, __p2_282, __p3_282) __extension__ ({ \ - int8x8_t __s0_282 = __p0_282; \ - int8x8_t __s2_282 = __p2_282; \ - int8x8_t __ret_282; \ - __ret_282 = vset_lane_s8(vget_lane_s8(__s2_282, __p3_282), __s0_282, __p1_282); \ - __ret_282; \ +#define vcopy_lane_s8(__p0_374, __p1_374, __p2_374, __p3_374) __extension__ ({ \ + int8x8_t __s0_374 = __p0_374; \ + int8x8_t __s2_374 = __p2_374; \ + int8x8_t __ret_374; \ + __ret_374 = vset_lane_s8(vget_lane_s8(__s2_374, __p3_374), __s0_374, __p1_374); \ + __ret_374; \ }) #else -#define 
vcopy_lane_s8(__p0_283, __p1_283, __p2_283, __p3_283) __extension__ ({ \ - int8x8_t __s0_283 = __p0_283; \ - int8x8_t __s2_283 = __p2_283; \ - int8x8_t __rev0_283; __rev0_283 = __builtin_shufflevector(__s0_283, __s0_283, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x8_t __rev2_283; __rev2_283 = __builtin_shufflevector(__s2_283, __s2_283, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x8_t __ret_283; \ - __ret_283 = __noswap_vset_lane_s8(__noswap_vget_lane_s8(__rev2_283, __p3_283), __rev0_283, __p1_283); \ - __ret_283 = __builtin_shufflevector(__ret_283, __ret_283, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_283; \ +#define vcopy_lane_s8(__p0_375, __p1_375, __p2_375, __p3_375) __extension__ ({ \ + int8x8_t __s0_375 = __p0_375; \ + int8x8_t __s2_375 = __p2_375; \ + int8x8_t __rev0_375; __rev0_375 = __builtin_shufflevector(__s0_375, __s0_375, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x8_t __rev2_375; __rev2_375 = __builtin_shufflevector(__s2_375, __s2_375, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x8_t __ret_375; \ + __ret_375 = __noswap_vset_lane_s8(__noswap_vget_lane_s8(__rev2_375, __p3_375), __rev0_375, __p1_375); \ + __ret_375 = __builtin_shufflevector(__ret_375, __ret_375, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_375; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_f32(__p0_284, __p1_284, __p2_284, __p3_284) __extension__ ({ \ - float32x2_t __s0_284 = __p0_284; \ - float32x2_t __s2_284 = __p2_284; \ - float32x2_t __ret_284; \ - __ret_284 = vset_lane_f32(vget_lane_f32(__s2_284, __p3_284), __s0_284, __p1_284); \ - __ret_284; \ +#define vcopy_lane_f32(__p0_376, __p1_376, __p2_376, __p3_376) __extension__ ({ \ + float32x2_t __s0_376 = __p0_376; \ + float32x2_t __s2_376 = __p2_376; \ + float32x2_t __ret_376; \ + __ret_376 = vset_lane_f32(vget_lane_f32(__s2_376, __p3_376), __s0_376, __p1_376); \ + __ret_376; \ }) #else -#define vcopy_lane_f32(__p0_285, __p1_285, __p2_285, __p3_285) __extension__ ({ \ - float32x2_t __s0_285 = __p0_285; \ - float32x2_t __s2_285 = __p2_285; \ - float32x2_t __rev0_285; __rev0_285 = __builtin_shufflevector(__s0_285, __s0_285, 1, 0); \ - float32x2_t __rev2_285; __rev2_285 = __builtin_shufflevector(__s2_285, __s2_285, 1, 0); \ - float32x2_t __ret_285; \ - __ret_285 = __noswap_vset_lane_f32(__noswap_vget_lane_f32(__rev2_285, __p3_285), __rev0_285, __p1_285); \ - __ret_285 = __builtin_shufflevector(__ret_285, __ret_285, 1, 0); \ - __ret_285; \ +#define vcopy_lane_f32(__p0_377, __p1_377, __p2_377, __p3_377) __extension__ ({ \ + float32x2_t __s0_377 = __p0_377; \ + float32x2_t __s2_377 = __p2_377; \ + float32x2_t __rev0_377; __rev0_377 = __builtin_shufflevector(__s0_377, __s0_377, 1, 0); \ + float32x2_t __rev2_377; __rev2_377 = __builtin_shufflevector(__s2_377, __s2_377, 1, 0); \ + float32x2_t __ret_377; \ + __ret_377 = __noswap_vset_lane_f32(__noswap_vget_lane_f32(__rev2_377, __p3_377), __rev0_377, __p1_377); \ + __ret_377 = __builtin_shufflevector(__ret_377, __ret_377, 1, 0); \ + __ret_377; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_s32(__p0_286, __p1_286, __p2_286, __p3_286) __extension__ ({ \ - int32x2_t __s0_286 = __p0_286; \ - int32x2_t __s2_286 = __p2_286; \ - int32x2_t __ret_286; \ - __ret_286 = vset_lane_s32(vget_lane_s32(__s2_286, __p3_286), __s0_286, __p1_286); \ - __ret_286; \ +#define vcopy_lane_s32(__p0_378, __p1_378, __p2_378, __p3_378) __extension__ ({ \ + int32x2_t __s0_378 = __p0_378; \ + int32x2_t __s2_378 = __p2_378; \ + int32x2_t __ret_378; \ + __ret_378 = vset_lane_s32(vget_lane_s32(__s2_378, __p3_378), __s0_378, __p1_378); \ + __ret_378; \ }) #else -#define vcopy_lane_s32(__p0_287, 
__p1_287, __p2_287, __p3_287) __extension__ ({ \ - int32x2_t __s0_287 = __p0_287; \ - int32x2_t __s2_287 = __p2_287; \ - int32x2_t __rev0_287; __rev0_287 = __builtin_shufflevector(__s0_287, __s0_287, 1, 0); \ - int32x2_t __rev2_287; __rev2_287 = __builtin_shufflevector(__s2_287, __s2_287, 1, 0); \ - int32x2_t __ret_287; \ - __ret_287 = __noswap_vset_lane_s32(__noswap_vget_lane_s32(__rev2_287, __p3_287), __rev0_287, __p1_287); \ - __ret_287 = __builtin_shufflevector(__ret_287, __ret_287, 1, 0); \ - __ret_287; \ +#define vcopy_lane_s32(__p0_379, __p1_379, __p2_379, __p3_379) __extension__ ({ \ + int32x2_t __s0_379 = __p0_379; \ + int32x2_t __s2_379 = __p2_379; \ + int32x2_t __rev0_379; __rev0_379 = __builtin_shufflevector(__s0_379, __s0_379, 1, 0); \ + int32x2_t __rev2_379; __rev2_379 = __builtin_shufflevector(__s2_379, __s2_379, 1, 0); \ + int32x2_t __ret_379; \ + __ret_379 = __noswap_vset_lane_s32(__noswap_vget_lane_s32(__rev2_379, __p3_379), __rev0_379, __p1_379); \ + __ret_379 = __builtin_shufflevector(__ret_379, __ret_379, 1, 0); \ + __ret_379; \ }) #endif -#define vcopy_lane_s64(__p0_288, __p1_288, __p2_288, __p3_288) __extension__ ({ \ - int64x1_t __s0_288 = __p0_288; \ - int64x1_t __s2_288 = __p2_288; \ - int64x1_t __ret_288; \ - __ret_288 = vset_lane_s64(vget_lane_s64(__s2_288, __p3_288), __s0_288, __p1_288); \ - __ret_288; \ +#define vcopy_lane_s64(__p0_380, __p1_380, __p2_380, __p3_380) __extension__ ({ \ + int64x1_t __s0_380 = __p0_380; \ + int64x1_t __s2_380 = __p2_380; \ + int64x1_t __ret_380; \ + __ret_380 = vset_lane_s64(vget_lane_s64(__s2_380, __p3_380), __s0_380, __p1_380); \ + __ret_380; \ }) #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_s16(__p0_289, __p1_289, __p2_289, __p3_289) __extension__ ({ \ - int16x4_t __s0_289 = __p0_289; \ - int16x4_t __s2_289 = __p2_289; \ - int16x4_t __ret_289; \ - __ret_289 = vset_lane_s16(vget_lane_s16(__s2_289, __p3_289), __s0_289, __p1_289); \ - __ret_289; \ +#define vcopy_lane_s16(__p0_381, __p1_381, __p2_381, __p3_381) __extension__ ({ \ + int16x4_t __s0_381 = __p0_381; \ + int16x4_t __s2_381 = __p2_381; \ + int16x4_t __ret_381; \ + __ret_381 = vset_lane_s16(vget_lane_s16(__s2_381, __p3_381), __s0_381, __p1_381); \ + __ret_381; \ }) #else -#define vcopy_lane_s16(__p0_290, __p1_290, __p2_290, __p3_290) __extension__ ({ \ - int16x4_t __s0_290 = __p0_290; \ - int16x4_t __s2_290 = __p2_290; \ - int16x4_t __rev0_290; __rev0_290 = __builtin_shufflevector(__s0_290, __s0_290, 3, 2, 1, 0); \ - int16x4_t __rev2_290; __rev2_290 = __builtin_shufflevector(__s2_290, __s2_290, 3, 2, 1, 0); \ - int16x4_t __ret_290; \ - __ret_290 = __noswap_vset_lane_s16(__noswap_vget_lane_s16(__rev2_290, __p3_290), __rev0_290, __p1_290); \ - __ret_290 = __builtin_shufflevector(__ret_290, __ret_290, 3, 2, 1, 0); \ - __ret_290; \ +#define vcopy_lane_s16(__p0_382, __p1_382, __p2_382, __p3_382) __extension__ ({ \ + int16x4_t __s0_382 = __p0_382; \ + int16x4_t __s2_382 = __p2_382; \ + int16x4_t __rev0_382; __rev0_382 = __builtin_shufflevector(__s0_382, __s0_382, 3, 2, 1, 0); \ + int16x4_t __rev2_382; __rev2_382 = __builtin_shufflevector(__s2_382, __s2_382, 3, 2, 1, 0); \ + int16x4_t __ret_382; \ + __ret_382 = __noswap_vset_lane_s16(__noswap_vget_lane_s16(__rev2_382, __p3_382), __rev0_382, __p1_382); \ + __ret_382 = __builtin_shufflevector(__ret_382, __ret_382, 3, 2, 1, 0); \ + __ret_382; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_p8(__p0_291, __p1_291, __p2_291, __p3_291) __extension__ ({ \ - poly8x16_t __s0_291 = __p0_291; \ - poly8x16_t __s2_291 = __p2_291; 
\ - poly8x16_t __ret_291; \ - __ret_291 = vsetq_lane_p8(vgetq_lane_p8(__s2_291, __p3_291), __s0_291, __p1_291); \ - __ret_291; \ +#define vcopyq_laneq_p8(__p0_383, __p1_383, __p2_383, __p3_383) __extension__ ({ \ + poly8x16_t __s0_383 = __p0_383; \ + poly8x16_t __s2_383 = __p2_383; \ + poly8x16_t __ret_383; \ + __ret_383 = vsetq_lane_p8(vgetq_lane_p8(__s2_383, __p3_383), __s0_383, __p1_383); \ + __ret_383; \ }) #else -#define vcopyq_laneq_p8(__p0_292, __p1_292, __p2_292, __p3_292) __extension__ ({ \ - poly8x16_t __s0_292 = __p0_292; \ - poly8x16_t __s2_292 = __p2_292; \ - poly8x16_t __rev0_292; __rev0_292 = __builtin_shufflevector(__s0_292, __s0_292, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly8x16_t __rev2_292; __rev2_292 = __builtin_shufflevector(__s2_292, __s2_292, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly8x16_t __ret_292; \ - __ret_292 = __noswap_vsetq_lane_p8(__noswap_vgetq_lane_p8(__rev2_292, __p3_292), __rev0_292, __p1_292); \ - __ret_292 = __builtin_shufflevector(__ret_292, __ret_292, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_292; \ +#define vcopyq_laneq_p8(__p0_384, __p1_384, __p2_384, __p3_384) __extension__ ({ \ + poly8x16_t __s0_384 = __p0_384; \ + poly8x16_t __s2_384 = __p2_384; \ + poly8x16_t __rev0_384; __rev0_384 = __builtin_shufflevector(__s0_384, __s0_384, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly8x16_t __rev2_384; __rev2_384 = __builtin_shufflevector(__s2_384, __s2_384, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly8x16_t __ret_384; \ + __ret_384 = __noswap_vsetq_lane_p8(__noswap_vgetq_lane_p8(__rev2_384, __p3_384), __rev0_384, __p1_384); \ + __ret_384 = __builtin_shufflevector(__ret_384, __ret_384, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_384; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_p16(__p0_293, __p1_293, __p2_293, __p3_293) __extension__ ({ \ - poly16x8_t __s0_293 = __p0_293; \ - poly16x8_t __s2_293 = __p2_293; \ - poly16x8_t __ret_293; \ - __ret_293 = vsetq_lane_p16(vgetq_lane_p16(__s2_293, __p3_293), __s0_293, __p1_293); \ - __ret_293; \ +#define vcopyq_laneq_p16(__p0_385, __p1_385, __p2_385, __p3_385) __extension__ ({ \ + poly16x8_t __s0_385 = __p0_385; \ + poly16x8_t __s2_385 = __p2_385; \ + poly16x8_t __ret_385; \ + __ret_385 = vsetq_lane_p16(vgetq_lane_p16(__s2_385, __p3_385), __s0_385, __p1_385); \ + __ret_385; \ }) #else -#define vcopyq_laneq_p16(__p0_294, __p1_294, __p2_294, __p3_294) __extension__ ({ \ - poly16x8_t __s0_294 = __p0_294; \ - poly16x8_t __s2_294 = __p2_294; \ - poly16x8_t __rev0_294; __rev0_294 = __builtin_shufflevector(__s0_294, __s0_294, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly16x8_t __rev2_294; __rev2_294 = __builtin_shufflevector(__s2_294, __s2_294, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly16x8_t __ret_294; \ - __ret_294 = __noswap_vsetq_lane_p16(__noswap_vgetq_lane_p16(__rev2_294, __p3_294), __rev0_294, __p1_294); \ - __ret_294 = __builtin_shufflevector(__ret_294, __ret_294, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_294; \ +#define vcopyq_laneq_p16(__p0_386, __p1_386, __p2_386, __p3_386) __extension__ ({ \ + poly16x8_t __s0_386 = __p0_386; \ + poly16x8_t __s2_386 = __p2_386; \ + poly16x8_t __rev0_386; __rev0_386 = __builtin_shufflevector(__s0_386, __s0_386, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly16x8_t __rev2_386; __rev2_386 = __builtin_shufflevector(__s2_386, __s2_386, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly16x8_t __ret_386; \ + __ret_386 = __noswap_vsetq_lane_p16(__noswap_vgetq_lane_p16(__rev2_386, __p3_386), __rev0_386, 
__p1_386); \ + __ret_386 = __builtin_shufflevector(__ret_386, __ret_386, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_386; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_u8(__p0_295, __p1_295, __p2_295, __p3_295) __extension__ ({ \ - uint8x16_t __s0_295 = __p0_295; \ - uint8x16_t __s2_295 = __p2_295; \ - uint8x16_t __ret_295; \ - __ret_295 = vsetq_lane_u8(vgetq_lane_u8(__s2_295, __p3_295), __s0_295, __p1_295); \ - __ret_295; \ +#define vcopyq_laneq_u8(__p0_387, __p1_387, __p2_387, __p3_387) __extension__ ({ \ + uint8x16_t __s0_387 = __p0_387; \ + uint8x16_t __s2_387 = __p2_387; \ + uint8x16_t __ret_387; \ + __ret_387 = vsetq_lane_u8(vgetq_lane_u8(__s2_387, __p3_387), __s0_387, __p1_387); \ + __ret_387; \ }) #else -#define vcopyq_laneq_u8(__p0_296, __p1_296, __p2_296, __p3_296) __extension__ ({ \ - uint8x16_t __s0_296 = __p0_296; \ - uint8x16_t __s2_296 = __p2_296; \ - uint8x16_t __rev0_296; __rev0_296 = __builtin_shufflevector(__s0_296, __s0_296, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __rev2_296; __rev2_296 = __builtin_shufflevector(__s2_296, __s2_296, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __ret_296; \ - __ret_296 = __noswap_vsetq_lane_u8(__noswap_vgetq_lane_u8(__rev2_296, __p3_296), __rev0_296, __p1_296); \ - __ret_296 = __builtin_shufflevector(__ret_296, __ret_296, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_296; \ +#define vcopyq_laneq_u8(__p0_388, __p1_388, __p2_388, __p3_388) __extension__ ({ \ + uint8x16_t __s0_388 = __p0_388; \ + uint8x16_t __s2_388 = __p2_388; \ + uint8x16_t __rev0_388; __rev0_388 = __builtin_shufflevector(__s0_388, __s0_388, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev2_388; __rev2_388 = __builtin_shufflevector(__s2_388, __s2_388, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __ret_388; \ + __ret_388 = __noswap_vsetq_lane_u8(__noswap_vgetq_lane_u8(__rev2_388, __p3_388), __rev0_388, __p1_388); \ + __ret_388 = __builtin_shufflevector(__ret_388, __ret_388, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_388; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_u32(__p0_297, __p1_297, __p2_297, __p3_297) __extension__ ({ \ - uint32x4_t __s0_297 = __p0_297; \ - uint32x4_t __s2_297 = __p2_297; \ - uint32x4_t __ret_297; \ - __ret_297 = vsetq_lane_u32(vgetq_lane_u32(__s2_297, __p3_297), __s0_297, __p1_297); \ - __ret_297; \ +#define vcopyq_laneq_u32(__p0_389, __p1_389, __p2_389, __p3_389) __extension__ ({ \ + uint32x4_t __s0_389 = __p0_389; \ + uint32x4_t __s2_389 = __p2_389; \ + uint32x4_t __ret_389; \ + __ret_389 = vsetq_lane_u32(vgetq_lane_u32(__s2_389, __p3_389), __s0_389, __p1_389); \ + __ret_389; \ }) #else -#define vcopyq_laneq_u32(__p0_298, __p1_298, __p2_298, __p3_298) __extension__ ({ \ - uint32x4_t __s0_298 = __p0_298; \ - uint32x4_t __s2_298 = __p2_298; \ - uint32x4_t __rev0_298; __rev0_298 = __builtin_shufflevector(__s0_298, __s0_298, 3, 2, 1, 0); \ - uint32x4_t __rev2_298; __rev2_298 = __builtin_shufflevector(__s2_298, __s2_298, 3, 2, 1, 0); \ - uint32x4_t __ret_298; \ - __ret_298 = __noswap_vsetq_lane_u32(__noswap_vgetq_lane_u32(__rev2_298, __p3_298), __rev0_298, __p1_298); \ - __ret_298 = __builtin_shufflevector(__ret_298, __ret_298, 3, 2, 1, 0); \ - __ret_298; \ +#define vcopyq_laneq_u32(__p0_390, __p1_390, __p2_390, __p3_390) __extension__ ({ \ + uint32x4_t __s0_390 = __p0_390; \ + uint32x4_t __s2_390 = __p2_390; \ + uint32x4_t __rev0_390; __rev0_390 = __builtin_shufflevector(__s0_390, 
__s0_390, 3, 2, 1, 0); \ + uint32x4_t __rev2_390; __rev2_390 = __builtin_shufflevector(__s2_390, __s2_390, 3, 2, 1, 0); \ + uint32x4_t __ret_390; \ + __ret_390 = __noswap_vsetq_lane_u32(__noswap_vgetq_lane_u32(__rev2_390, __p3_390), __rev0_390, __p1_390); \ + __ret_390 = __builtin_shufflevector(__ret_390, __ret_390, 3, 2, 1, 0); \ + __ret_390; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_u64(__p0_299, __p1_299, __p2_299, __p3_299) __extension__ ({ \ - uint64x2_t __s0_299 = __p0_299; \ - uint64x2_t __s2_299 = __p2_299; \ - uint64x2_t __ret_299; \ - __ret_299 = vsetq_lane_u64(vgetq_lane_u64(__s2_299, __p3_299), __s0_299, __p1_299); \ - __ret_299; \ +#define vcopyq_laneq_u64(__p0_391, __p1_391, __p2_391, __p3_391) __extension__ ({ \ + uint64x2_t __s0_391 = __p0_391; \ + uint64x2_t __s2_391 = __p2_391; \ + uint64x2_t __ret_391; \ + __ret_391 = vsetq_lane_u64(vgetq_lane_u64(__s2_391, __p3_391), __s0_391, __p1_391); \ + __ret_391; \ }) #else -#define vcopyq_laneq_u64(__p0_300, __p1_300, __p2_300, __p3_300) __extension__ ({ \ - uint64x2_t __s0_300 = __p0_300; \ - uint64x2_t __s2_300 = __p2_300; \ - uint64x2_t __rev0_300; __rev0_300 = __builtin_shufflevector(__s0_300, __s0_300, 1, 0); \ - uint64x2_t __rev2_300; __rev2_300 = __builtin_shufflevector(__s2_300, __s2_300, 1, 0); \ - uint64x2_t __ret_300; \ - __ret_300 = __noswap_vsetq_lane_u64(__noswap_vgetq_lane_u64(__rev2_300, __p3_300), __rev0_300, __p1_300); \ - __ret_300 = __builtin_shufflevector(__ret_300, __ret_300, 1, 0); \ - __ret_300; \ +#define vcopyq_laneq_u64(__p0_392, __p1_392, __p2_392, __p3_392) __extension__ ({ \ + uint64x2_t __s0_392 = __p0_392; \ + uint64x2_t __s2_392 = __p2_392; \ + uint64x2_t __rev0_392; __rev0_392 = __builtin_shufflevector(__s0_392, __s0_392, 1, 0); \ + uint64x2_t __rev2_392; __rev2_392 = __builtin_shufflevector(__s2_392, __s2_392, 1, 0); \ + uint64x2_t __ret_392; \ + __ret_392 = __noswap_vsetq_lane_u64(__noswap_vgetq_lane_u64(__rev2_392, __p3_392), __rev0_392, __p1_392); \ + __ret_392 = __builtin_shufflevector(__ret_392, __ret_392, 1, 0); \ + __ret_392; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_u16(__p0_301, __p1_301, __p2_301, __p3_301) __extension__ ({ \ - uint16x8_t __s0_301 = __p0_301; \ - uint16x8_t __s2_301 = __p2_301; \ - uint16x8_t __ret_301; \ - __ret_301 = vsetq_lane_u16(vgetq_lane_u16(__s2_301, __p3_301), __s0_301, __p1_301); \ - __ret_301; \ +#define vcopyq_laneq_u16(__p0_393, __p1_393, __p2_393, __p3_393) __extension__ ({ \ + uint16x8_t __s0_393 = __p0_393; \ + uint16x8_t __s2_393 = __p2_393; \ + uint16x8_t __ret_393; \ + __ret_393 = vsetq_lane_u16(vgetq_lane_u16(__s2_393, __p3_393), __s0_393, __p1_393); \ + __ret_393; \ }) #else -#define vcopyq_laneq_u16(__p0_302, __p1_302, __p2_302, __p3_302) __extension__ ({ \ - uint16x8_t __s0_302 = __p0_302; \ - uint16x8_t __s2_302 = __p2_302; \ - uint16x8_t __rev0_302; __rev0_302 = __builtin_shufflevector(__s0_302, __s0_302, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __rev2_302; __rev2_302 = __builtin_shufflevector(__s2_302, __s2_302, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __ret_302; \ - __ret_302 = __noswap_vsetq_lane_u16(__noswap_vgetq_lane_u16(__rev2_302, __p3_302), __rev0_302, __p1_302); \ - __ret_302 = __builtin_shufflevector(__ret_302, __ret_302, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_302; \ +#define vcopyq_laneq_u16(__p0_394, __p1_394, __p2_394, __p3_394) __extension__ ({ \ + uint16x8_t __s0_394 = __p0_394; \ + uint16x8_t __s2_394 = __p2_394; \ + uint16x8_t __rev0_394; __rev0_394 = __builtin_shufflevector(__s0_394, __s0_394, 7, 
6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __rev2_394; __rev2_394 = __builtin_shufflevector(__s2_394, __s2_394, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __ret_394; \ + __ret_394 = __noswap_vsetq_lane_u16(__noswap_vgetq_lane_u16(__rev2_394, __p3_394), __rev0_394, __p1_394); \ + __ret_394 = __builtin_shufflevector(__ret_394, __ret_394, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_394; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_s8(__p0_303, __p1_303, __p2_303, __p3_303) __extension__ ({ \ - int8x16_t __s0_303 = __p0_303; \ - int8x16_t __s2_303 = __p2_303; \ - int8x16_t __ret_303; \ - __ret_303 = vsetq_lane_s8(vgetq_lane_s8(__s2_303, __p3_303), __s0_303, __p1_303); \ - __ret_303; \ +#define vcopyq_laneq_s8(__p0_395, __p1_395, __p2_395, __p3_395) __extension__ ({ \ + int8x16_t __s0_395 = __p0_395; \ + int8x16_t __s2_395 = __p2_395; \ + int8x16_t __ret_395; \ + __ret_395 = vsetq_lane_s8(vgetq_lane_s8(__s2_395, __p3_395), __s0_395, __p1_395); \ + __ret_395; \ }) #else -#define vcopyq_laneq_s8(__p0_304, __p1_304, __p2_304, __p3_304) __extension__ ({ \ - int8x16_t __s0_304 = __p0_304; \ - int8x16_t __s2_304 = __p2_304; \ - int8x16_t __rev0_304; __rev0_304 = __builtin_shufflevector(__s0_304, __s0_304, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __rev2_304; __rev2_304 = __builtin_shufflevector(__s2_304, __s2_304, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __ret_304; \ - __ret_304 = __noswap_vsetq_lane_s8(__noswap_vgetq_lane_s8(__rev2_304, __p3_304), __rev0_304, __p1_304); \ - __ret_304 = __builtin_shufflevector(__ret_304, __ret_304, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_304; \ +#define vcopyq_laneq_s8(__p0_396, __p1_396, __p2_396, __p3_396) __extension__ ({ \ + int8x16_t __s0_396 = __p0_396; \ + int8x16_t __s2_396 = __p2_396; \ + int8x16_t __rev0_396; __rev0_396 = __builtin_shufflevector(__s0_396, __s0_396, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __rev2_396; __rev2_396 = __builtin_shufflevector(__s2_396, __s2_396, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __ret_396; \ + __ret_396 = __noswap_vsetq_lane_s8(__noswap_vgetq_lane_s8(__rev2_396, __p3_396), __rev0_396, __p1_396); \ + __ret_396 = __builtin_shufflevector(__ret_396, __ret_396, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_396; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_f32(__p0_305, __p1_305, __p2_305, __p3_305) __extension__ ({ \ - float32x4_t __s0_305 = __p0_305; \ - float32x4_t __s2_305 = __p2_305; \ - float32x4_t __ret_305; \ - __ret_305 = vsetq_lane_f32(vgetq_lane_f32(__s2_305, __p3_305), __s0_305, __p1_305); \ - __ret_305; \ +#define vcopyq_laneq_f32(__p0_397, __p1_397, __p2_397, __p3_397) __extension__ ({ \ + float32x4_t __s0_397 = __p0_397; \ + float32x4_t __s2_397 = __p2_397; \ + float32x4_t __ret_397; \ + __ret_397 = vsetq_lane_f32(vgetq_lane_f32(__s2_397, __p3_397), __s0_397, __p1_397); \ + __ret_397; \ }) #else -#define vcopyq_laneq_f32(__p0_306, __p1_306, __p2_306, __p3_306) __extension__ ({ \ - float32x4_t __s0_306 = __p0_306; \ - float32x4_t __s2_306 = __p2_306; \ - float32x4_t __rev0_306; __rev0_306 = __builtin_shufflevector(__s0_306, __s0_306, 3, 2, 1, 0); \ - float32x4_t __rev2_306; __rev2_306 = __builtin_shufflevector(__s2_306, __s2_306, 3, 2, 1, 0); \ - float32x4_t __ret_306; \ - __ret_306 = __noswap_vsetq_lane_f32(__noswap_vgetq_lane_f32(__rev2_306, __p3_306), __rev0_306, __p1_306); \ - __ret_306 = __builtin_shufflevector(__ret_306, __ret_306, 3, 2, 1, 0); \ 
- __ret_306; \ +#define vcopyq_laneq_f32(__p0_398, __p1_398, __p2_398, __p3_398) __extension__ ({ \ + float32x4_t __s0_398 = __p0_398; \ + float32x4_t __s2_398 = __p2_398; \ + float32x4_t __rev0_398; __rev0_398 = __builtin_shufflevector(__s0_398, __s0_398, 3, 2, 1, 0); \ + float32x4_t __rev2_398; __rev2_398 = __builtin_shufflevector(__s2_398, __s2_398, 3, 2, 1, 0); \ + float32x4_t __ret_398; \ + __ret_398 = __noswap_vsetq_lane_f32(__noswap_vgetq_lane_f32(__rev2_398, __p3_398), __rev0_398, __p1_398); \ + __ret_398 = __builtin_shufflevector(__ret_398, __ret_398, 3, 2, 1, 0); \ + __ret_398; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_s32(__p0_307, __p1_307, __p2_307, __p3_307) __extension__ ({ \ - int32x4_t __s0_307 = __p0_307; \ - int32x4_t __s2_307 = __p2_307; \ - int32x4_t __ret_307; \ - __ret_307 = vsetq_lane_s32(vgetq_lane_s32(__s2_307, __p3_307), __s0_307, __p1_307); \ - __ret_307; \ +#define vcopyq_laneq_s32(__p0_399, __p1_399, __p2_399, __p3_399) __extension__ ({ \ + int32x4_t __s0_399 = __p0_399; \ + int32x4_t __s2_399 = __p2_399; \ + int32x4_t __ret_399; \ + __ret_399 = vsetq_lane_s32(vgetq_lane_s32(__s2_399, __p3_399), __s0_399, __p1_399); \ + __ret_399; \ }) #else -#define vcopyq_laneq_s32(__p0_308, __p1_308, __p2_308, __p3_308) __extension__ ({ \ - int32x4_t __s0_308 = __p0_308; \ - int32x4_t __s2_308 = __p2_308; \ - int32x4_t __rev0_308; __rev0_308 = __builtin_shufflevector(__s0_308, __s0_308, 3, 2, 1, 0); \ - int32x4_t __rev2_308; __rev2_308 = __builtin_shufflevector(__s2_308, __s2_308, 3, 2, 1, 0); \ - int32x4_t __ret_308; \ - __ret_308 = __noswap_vsetq_lane_s32(__noswap_vgetq_lane_s32(__rev2_308, __p3_308), __rev0_308, __p1_308); \ - __ret_308 = __builtin_shufflevector(__ret_308, __ret_308, 3, 2, 1, 0); \ - __ret_308; \ +#define vcopyq_laneq_s32(__p0_400, __p1_400, __p2_400, __p3_400) __extension__ ({ \ + int32x4_t __s0_400 = __p0_400; \ + int32x4_t __s2_400 = __p2_400; \ + int32x4_t __rev0_400; __rev0_400 = __builtin_shufflevector(__s0_400, __s0_400, 3, 2, 1, 0); \ + int32x4_t __rev2_400; __rev2_400 = __builtin_shufflevector(__s2_400, __s2_400, 3, 2, 1, 0); \ + int32x4_t __ret_400; \ + __ret_400 = __noswap_vsetq_lane_s32(__noswap_vgetq_lane_s32(__rev2_400, __p3_400), __rev0_400, __p1_400); \ + __ret_400 = __builtin_shufflevector(__ret_400, __ret_400, 3, 2, 1, 0); \ + __ret_400; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_s64(__p0_309, __p1_309, __p2_309, __p3_309) __extension__ ({ \ - int64x2_t __s0_309 = __p0_309; \ - int64x2_t __s2_309 = __p2_309; \ - int64x2_t __ret_309; \ - __ret_309 = vsetq_lane_s64(vgetq_lane_s64(__s2_309, __p3_309), __s0_309, __p1_309); \ - __ret_309; \ +#define vcopyq_laneq_s64(__p0_401, __p1_401, __p2_401, __p3_401) __extension__ ({ \ + int64x2_t __s0_401 = __p0_401; \ + int64x2_t __s2_401 = __p2_401; \ + int64x2_t __ret_401; \ + __ret_401 = vsetq_lane_s64(vgetq_lane_s64(__s2_401, __p3_401), __s0_401, __p1_401); \ + __ret_401; \ }) #else -#define vcopyq_laneq_s64(__p0_310, __p1_310, __p2_310, __p3_310) __extension__ ({ \ - int64x2_t __s0_310 = __p0_310; \ - int64x2_t __s2_310 = __p2_310; \ - int64x2_t __rev0_310; __rev0_310 = __builtin_shufflevector(__s0_310, __s0_310, 1, 0); \ - int64x2_t __rev2_310; __rev2_310 = __builtin_shufflevector(__s2_310, __s2_310, 1, 0); \ - int64x2_t __ret_310; \ - __ret_310 = __noswap_vsetq_lane_s64(__noswap_vgetq_lane_s64(__rev2_310, __p3_310), __rev0_310, __p1_310); \ - __ret_310 = __builtin_shufflevector(__ret_310, __ret_310, 1, 0); \ - __ret_310; \ +#define vcopyq_laneq_s64(__p0_402, 
__p1_402, __p2_402, __p3_402) __extension__ ({ \ + int64x2_t __s0_402 = __p0_402; \ + int64x2_t __s2_402 = __p2_402; \ + int64x2_t __rev0_402; __rev0_402 = __builtin_shufflevector(__s0_402, __s0_402, 1, 0); \ + int64x2_t __rev2_402; __rev2_402 = __builtin_shufflevector(__s2_402, __s2_402, 1, 0); \ + int64x2_t __ret_402; \ + __ret_402 = __noswap_vsetq_lane_s64(__noswap_vgetq_lane_s64(__rev2_402, __p3_402), __rev0_402, __p1_402); \ + __ret_402 = __builtin_shufflevector(__ret_402, __ret_402, 1, 0); \ + __ret_402; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_s16(__p0_311, __p1_311, __p2_311, __p3_311) __extension__ ({ \ - int16x8_t __s0_311 = __p0_311; \ - int16x8_t __s2_311 = __p2_311; \ - int16x8_t __ret_311; \ - __ret_311 = vsetq_lane_s16(vgetq_lane_s16(__s2_311, __p3_311), __s0_311, __p1_311); \ - __ret_311; \ +#define vcopyq_laneq_s16(__p0_403, __p1_403, __p2_403, __p3_403) __extension__ ({ \ + int16x8_t __s0_403 = __p0_403; \ + int16x8_t __s2_403 = __p2_403; \ + int16x8_t __ret_403; \ + __ret_403 = vsetq_lane_s16(vgetq_lane_s16(__s2_403, __p3_403), __s0_403, __p1_403); \ + __ret_403; \ }) #else -#define vcopyq_laneq_s16(__p0_312, __p1_312, __p2_312, __p3_312) __extension__ ({ \ - int16x8_t __s0_312 = __p0_312; \ - int16x8_t __s2_312 = __p2_312; \ - int16x8_t __rev0_312; __rev0_312 = __builtin_shufflevector(__s0_312, __s0_312, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev2_312; __rev2_312 = __builtin_shufflevector(__s2_312, __s2_312, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __ret_312; \ - __ret_312 = __noswap_vsetq_lane_s16(__noswap_vgetq_lane_s16(__rev2_312, __p3_312), __rev0_312, __p1_312); \ - __ret_312 = __builtin_shufflevector(__ret_312, __ret_312, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_312; \ +#define vcopyq_laneq_s16(__p0_404, __p1_404, __p2_404, __p3_404) __extension__ ({ \ + int16x8_t __s0_404 = __p0_404; \ + int16x8_t __s2_404 = __p2_404; \ + int16x8_t __rev0_404; __rev0_404 = __builtin_shufflevector(__s0_404, __s0_404, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev2_404; __rev2_404 = __builtin_shufflevector(__s2_404, __s2_404, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __ret_404; \ + __ret_404 = __noswap_vsetq_lane_s16(__noswap_vgetq_lane_s16(__rev2_404, __p3_404), __rev0_404, __p1_404); \ + __ret_404 = __builtin_shufflevector(__ret_404, __ret_404, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_404; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_p8(__p0_313, __p1_313, __p2_313, __p3_313) __extension__ ({ \ - poly8x8_t __s0_313 = __p0_313; \ - poly8x16_t __s2_313 = __p2_313; \ - poly8x8_t __ret_313; \ - __ret_313 = vset_lane_p8(vgetq_lane_p8(__s2_313, __p3_313), __s0_313, __p1_313); \ - __ret_313; \ +#define vcopy_laneq_p8(__p0_405, __p1_405, __p2_405, __p3_405) __extension__ ({ \ + poly8x8_t __s0_405 = __p0_405; \ + poly8x16_t __s2_405 = __p2_405; \ + poly8x8_t __ret_405; \ + __ret_405 = vset_lane_p8(vgetq_lane_p8(__s2_405, __p3_405), __s0_405, __p1_405); \ + __ret_405; \ }) #else -#define vcopy_laneq_p8(__p0_314, __p1_314, __p2_314, __p3_314) __extension__ ({ \ - poly8x8_t __s0_314 = __p0_314; \ - poly8x16_t __s2_314 = __p2_314; \ - poly8x8_t __rev0_314; __rev0_314 = __builtin_shufflevector(__s0_314, __s0_314, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly8x16_t __rev2_314; __rev2_314 = __builtin_shufflevector(__s2_314, __s2_314, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly8x8_t __ret_314; \ - __ret_314 = __noswap_vset_lane_p8(__noswap_vgetq_lane_p8(__rev2_314, __p3_314), __rev0_314, __p1_314); \ - __ret_314 = __builtin_shufflevector(__ret_314, __ret_314, 7, 6, 5, 4, 3, 
2, 1, 0); \ - __ret_314; \ +#define vcopy_laneq_p8(__p0_406, __p1_406, __p2_406, __p3_406) __extension__ ({ \ + poly8x8_t __s0_406 = __p0_406; \ + poly8x16_t __s2_406 = __p2_406; \ + poly8x8_t __rev0_406; __rev0_406 = __builtin_shufflevector(__s0_406, __s0_406, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly8x16_t __rev2_406; __rev2_406 = __builtin_shufflevector(__s2_406, __s2_406, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly8x8_t __ret_406; \ + __ret_406 = __noswap_vset_lane_p8(__noswap_vgetq_lane_p8(__rev2_406, __p3_406), __rev0_406, __p1_406); \ + __ret_406 = __builtin_shufflevector(__ret_406, __ret_406, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_406; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_p16(__p0_315, __p1_315, __p2_315, __p3_315) __extension__ ({ \ - poly16x4_t __s0_315 = __p0_315; \ - poly16x8_t __s2_315 = __p2_315; \ - poly16x4_t __ret_315; \ - __ret_315 = vset_lane_p16(vgetq_lane_p16(__s2_315, __p3_315), __s0_315, __p1_315); \ - __ret_315; \ +#define vcopy_laneq_p16(__p0_407, __p1_407, __p2_407, __p3_407) __extension__ ({ \ + poly16x4_t __s0_407 = __p0_407; \ + poly16x8_t __s2_407 = __p2_407; \ + poly16x4_t __ret_407; \ + __ret_407 = vset_lane_p16(vgetq_lane_p16(__s2_407, __p3_407), __s0_407, __p1_407); \ + __ret_407; \ }) #else -#define vcopy_laneq_p16(__p0_316, __p1_316, __p2_316, __p3_316) __extension__ ({ \ - poly16x4_t __s0_316 = __p0_316; \ - poly16x8_t __s2_316 = __p2_316; \ - poly16x4_t __rev0_316; __rev0_316 = __builtin_shufflevector(__s0_316, __s0_316, 3, 2, 1, 0); \ - poly16x8_t __rev2_316; __rev2_316 = __builtin_shufflevector(__s2_316, __s2_316, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly16x4_t __ret_316; \ - __ret_316 = __noswap_vset_lane_p16(__noswap_vgetq_lane_p16(__rev2_316, __p3_316), __rev0_316, __p1_316); \ - __ret_316 = __builtin_shufflevector(__ret_316, __ret_316, 3, 2, 1, 0); \ - __ret_316; \ +#define vcopy_laneq_p16(__p0_408, __p1_408, __p2_408, __p3_408) __extension__ ({ \ + poly16x4_t __s0_408 = __p0_408; \ + poly16x8_t __s2_408 = __p2_408; \ + poly16x4_t __rev0_408; __rev0_408 = __builtin_shufflevector(__s0_408, __s0_408, 3, 2, 1, 0); \ + poly16x8_t __rev2_408; __rev2_408 = __builtin_shufflevector(__s2_408, __s2_408, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly16x4_t __ret_408; \ + __ret_408 = __noswap_vset_lane_p16(__noswap_vgetq_lane_p16(__rev2_408, __p3_408), __rev0_408, __p1_408); \ + __ret_408 = __builtin_shufflevector(__ret_408, __ret_408, 3, 2, 1, 0); \ + __ret_408; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_u8(__p0_317, __p1_317, __p2_317, __p3_317) __extension__ ({ \ - uint8x8_t __s0_317 = __p0_317; \ - uint8x16_t __s2_317 = __p2_317; \ - uint8x8_t __ret_317; \ - __ret_317 = vset_lane_u8(vgetq_lane_u8(__s2_317, __p3_317), __s0_317, __p1_317); \ - __ret_317; \ +#define vcopy_laneq_u8(__p0_409, __p1_409, __p2_409, __p3_409) __extension__ ({ \ + uint8x8_t __s0_409 = __p0_409; \ + uint8x16_t __s2_409 = __p2_409; \ + uint8x8_t __ret_409; \ + __ret_409 = vset_lane_u8(vgetq_lane_u8(__s2_409, __p3_409), __s0_409, __p1_409); \ + __ret_409; \ }) #else -#define vcopy_laneq_u8(__p0_318, __p1_318, __p2_318, __p3_318) __extension__ ({ \ - uint8x8_t __s0_318 = __p0_318; \ - uint8x16_t __s2_318 = __p2_318; \ - uint8x8_t __rev0_318; __rev0_318 = __builtin_shufflevector(__s0_318, __s0_318, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __rev2_318; __rev2_318 = __builtin_shufflevector(__s2_318, __s2_318, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x8_t __ret_318; \ - __ret_318 = __noswap_vset_lane_u8(__noswap_vgetq_lane_u8(__rev2_318, 
__p3_318), __rev0_318, __p1_318); \ - __ret_318 = __builtin_shufflevector(__ret_318, __ret_318, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_318; \ +#define vcopy_laneq_u8(__p0_410, __p1_410, __p2_410, __p3_410) __extension__ ({ \ + uint8x8_t __s0_410 = __p0_410; \ + uint8x16_t __s2_410 = __p2_410; \ + uint8x8_t __rev0_410; __rev0_410 = __builtin_shufflevector(__s0_410, __s0_410, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev2_410; __rev2_410 = __builtin_shufflevector(__s2_410, __s2_410, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __ret_410; \ + __ret_410 = __noswap_vset_lane_u8(__noswap_vgetq_lane_u8(__rev2_410, __p3_410), __rev0_410, __p1_410); \ + __ret_410 = __builtin_shufflevector(__ret_410, __ret_410, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_410; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_u32(__p0_319, __p1_319, __p2_319, __p3_319) __extension__ ({ \ - uint32x2_t __s0_319 = __p0_319; \ - uint32x4_t __s2_319 = __p2_319; \ - uint32x2_t __ret_319; \ - __ret_319 = vset_lane_u32(vgetq_lane_u32(__s2_319, __p3_319), __s0_319, __p1_319); \ - __ret_319; \ +#define vcopy_laneq_u32(__p0_411, __p1_411, __p2_411, __p3_411) __extension__ ({ \ + uint32x2_t __s0_411 = __p0_411; \ + uint32x4_t __s2_411 = __p2_411; \ + uint32x2_t __ret_411; \ + __ret_411 = vset_lane_u32(vgetq_lane_u32(__s2_411, __p3_411), __s0_411, __p1_411); \ + __ret_411; \ }) #else -#define vcopy_laneq_u32(__p0_320, __p1_320, __p2_320, __p3_320) __extension__ ({ \ - uint32x2_t __s0_320 = __p0_320; \ - uint32x4_t __s2_320 = __p2_320; \ - uint32x2_t __rev0_320; __rev0_320 = __builtin_shufflevector(__s0_320, __s0_320, 1, 0); \ - uint32x4_t __rev2_320; __rev2_320 = __builtin_shufflevector(__s2_320, __s2_320, 3, 2, 1, 0); \ - uint32x2_t __ret_320; \ - __ret_320 = __noswap_vset_lane_u32(__noswap_vgetq_lane_u32(__rev2_320, __p3_320), __rev0_320, __p1_320); \ - __ret_320 = __builtin_shufflevector(__ret_320, __ret_320, 1, 0); \ - __ret_320; \ +#define vcopy_laneq_u32(__p0_412, __p1_412, __p2_412, __p3_412) __extension__ ({ \ + uint32x2_t __s0_412 = __p0_412; \ + uint32x4_t __s2_412 = __p2_412; \ + uint32x2_t __rev0_412; __rev0_412 = __builtin_shufflevector(__s0_412, __s0_412, 1, 0); \ + uint32x4_t __rev2_412; __rev2_412 = __builtin_shufflevector(__s2_412, __s2_412, 3, 2, 1, 0); \ + uint32x2_t __ret_412; \ + __ret_412 = __noswap_vset_lane_u32(__noswap_vgetq_lane_u32(__rev2_412, __p3_412), __rev0_412, __p1_412); \ + __ret_412 = __builtin_shufflevector(__ret_412, __ret_412, 1, 0); \ + __ret_412; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_u64(__p0_321, __p1_321, __p2_321, __p3_321) __extension__ ({ \ - uint64x1_t __s0_321 = __p0_321; \ - uint64x2_t __s2_321 = __p2_321; \ - uint64x1_t __ret_321; \ - __ret_321 = vset_lane_u64(vgetq_lane_u64(__s2_321, __p3_321), __s0_321, __p1_321); \ - __ret_321; \ +#define vcopy_laneq_u64(__p0_413, __p1_413, __p2_413, __p3_413) __extension__ ({ \ + uint64x1_t __s0_413 = __p0_413; \ + uint64x2_t __s2_413 = __p2_413; \ + uint64x1_t __ret_413; \ + __ret_413 = vset_lane_u64(vgetq_lane_u64(__s2_413, __p3_413), __s0_413, __p1_413); \ + __ret_413; \ }) #else -#define vcopy_laneq_u64(__p0_322, __p1_322, __p2_322, __p3_322) __extension__ ({ \ - uint64x1_t __s0_322 = __p0_322; \ - uint64x2_t __s2_322 = __p2_322; \ - uint64x2_t __rev2_322; __rev2_322 = __builtin_shufflevector(__s2_322, __s2_322, 1, 0); \ - uint64x1_t __ret_322; \ - __ret_322 = vset_lane_u64(__noswap_vgetq_lane_u64(__rev2_322, __p3_322), __s0_322, __p1_322); \ - __ret_322; \ +#define vcopy_laneq_u64(__p0_414, __p1_414, 
__p2_414, __p3_414) __extension__ ({ \ + uint64x1_t __s0_414 = __p0_414; \ + uint64x2_t __s2_414 = __p2_414; \ + uint64x2_t __rev2_414; __rev2_414 = __builtin_shufflevector(__s2_414, __s2_414, 1, 0); \ + uint64x1_t __ret_414; \ + __ret_414 = vset_lane_u64(__noswap_vgetq_lane_u64(__rev2_414, __p3_414), __s0_414, __p1_414); \ + __ret_414; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_u16(__p0_323, __p1_323, __p2_323, __p3_323) __extension__ ({ \ - uint16x4_t __s0_323 = __p0_323; \ - uint16x8_t __s2_323 = __p2_323; \ - uint16x4_t __ret_323; \ - __ret_323 = vset_lane_u16(vgetq_lane_u16(__s2_323, __p3_323), __s0_323, __p1_323); \ - __ret_323; \ +#define vcopy_laneq_u16(__p0_415, __p1_415, __p2_415, __p3_415) __extension__ ({ \ + uint16x4_t __s0_415 = __p0_415; \ + uint16x8_t __s2_415 = __p2_415; \ + uint16x4_t __ret_415; \ + __ret_415 = vset_lane_u16(vgetq_lane_u16(__s2_415, __p3_415), __s0_415, __p1_415); \ + __ret_415; \ }) #else -#define vcopy_laneq_u16(__p0_324, __p1_324, __p2_324, __p3_324) __extension__ ({ \ - uint16x4_t __s0_324 = __p0_324; \ - uint16x8_t __s2_324 = __p2_324; \ - uint16x4_t __rev0_324; __rev0_324 = __builtin_shufflevector(__s0_324, __s0_324, 3, 2, 1, 0); \ - uint16x8_t __rev2_324; __rev2_324 = __builtin_shufflevector(__s2_324, __s2_324, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x4_t __ret_324; \ - __ret_324 = __noswap_vset_lane_u16(__noswap_vgetq_lane_u16(__rev2_324, __p3_324), __rev0_324, __p1_324); \ - __ret_324 = __builtin_shufflevector(__ret_324, __ret_324, 3, 2, 1, 0); \ - __ret_324; \ +#define vcopy_laneq_u16(__p0_416, __p1_416, __p2_416, __p3_416) __extension__ ({ \ + uint16x4_t __s0_416 = __p0_416; \ + uint16x8_t __s2_416 = __p2_416; \ + uint16x4_t __rev0_416; __rev0_416 = __builtin_shufflevector(__s0_416, __s0_416, 3, 2, 1, 0); \ + uint16x8_t __rev2_416; __rev2_416 = __builtin_shufflevector(__s2_416, __s2_416, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x4_t __ret_416; \ + __ret_416 = __noswap_vset_lane_u16(__noswap_vgetq_lane_u16(__rev2_416, __p3_416), __rev0_416, __p1_416); \ + __ret_416 = __builtin_shufflevector(__ret_416, __ret_416, 3, 2, 1, 0); \ + __ret_416; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_s8(__p0_325, __p1_325, __p2_325, __p3_325) __extension__ ({ \ - int8x8_t __s0_325 = __p0_325; \ - int8x16_t __s2_325 = __p2_325; \ - int8x8_t __ret_325; \ - __ret_325 = vset_lane_s8(vgetq_lane_s8(__s2_325, __p3_325), __s0_325, __p1_325); \ - __ret_325; \ +#define vcopy_laneq_s8(__p0_417, __p1_417, __p2_417, __p3_417) __extension__ ({ \ + int8x8_t __s0_417 = __p0_417; \ + int8x16_t __s2_417 = __p2_417; \ + int8x8_t __ret_417; \ + __ret_417 = vset_lane_s8(vgetq_lane_s8(__s2_417, __p3_417), __s0_417, __p1_417); \ + __ret_417; \ }) #else -#define vcopy_laneq_s8(__p0_326, __p1_326, __p2_326, __p3_326) __extension__ ({ \ - int8x8_t __s0_326 = __p0_326; \ - int8x16_t __s2_326 = __p2_326; \ - int8x8_t __rev0_326; __rev0_326 = __builtin_shufflevector(__s0_326, __s0_326, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __rev2_326; __rev2_326 = __builtin_shufflevector(__s2_326, __s2_326, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x8_t __ret_326; \ - __ret_326 = __noswap_vset_lane_s8(__noswap_vgetq_lane_s8(__rev2_326, __p3_326), __rev0_326, __p1_326); \ - __ret_326 = __builtin_shufflevector(__ret_326, __ret_326, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_326; \ +#define vcopy_laneq_s8(__p0_418, __p1_418, __p2_418, __p3_418) __extension__ ({ \ + int8x8_t __s0_418 = __p0_418; \ + int8x16_t __s2_418 = __p2_418; \ + int8x8_t __rev0_418; __rev0_418 = 
__builtin_shufflevector(__s0_418, __s0_418, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __rev2_418; __rev2_418 = __builtin_shufflevector(__s2_418, __s2_418, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x8_t __ret_418; \ + __ret_418 = __noswap_vset_lane_s8(__noswap_vgetq_lane_s8(__rev2_418, __p3_418), __rev0_418, __p1_418); \ + __ret_418 = __builtin_shufflevector(__ret_418, __ret_418, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_418; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_f32(__p0_327, __p1_327, __p2_327, __p3_327) __extension__ ({ \ - float32x2_t __s0_327 = __p0_327; \ - float32x4_t __s2_327 = __p2_327; \ - float32x2_t __ret_327; \ - __ret_327 = vset_lane_f32(vgetq_lane_f32(__s2_327, __p3_327), __s0_327, __p1_327); \ - __ret_327; \ +#define vcopy_laneq_f32(__p0_419, __p1_419, __p2_419, __p3_419) __extension__ ({ \ + float32x2_t __s0_419 = __p0_419; \ + float32x4_t __s2_419 = __p2_419; \ + float32x2_t __ret_419; \ + __ret_419 = vset_lane_f32(vgetq_lane_f32(__s2_419, __p3_419), __s0_419, __p1_419); \ + __ret_419; \ }) #else -#define vcopy_laneq_f32(__p0_328, __p1_328, __p2_328, __p3_328) __extension__ ({ \ - float32x2_t __s0_328 = __p0_328; \ - float32x4_t __s2_328 = __p2_328; \ - float32x2_t __rev0_328; __rev0_328 = __builtin_shufflevector(__s0_328, __s0_328, 1, 0); \ - float32x4_t __rev2_328; __rev2_328 = __builtin_shufflevector(__s2_328, __s2_328, 3, 2, 1, 0); \ - float32x2_t __ret_328; \ - __ret_328 = __noswap_vset_lane_f32(__noswap_vgetq_lane_f32(__rev2_328, __p3_328), __rev0_328, __p1_328); \ - __ret_328 = __builtin_shufflevector(__ret_328, __ret_328, 1, 0); \ - __ret_328; \ +#define vcopy_laneq_f32(__p0_420, __p1_420, __p2_420, __p3_420) __extension__ ({ \ + float32x2_t __s0_420 = __p0_420; \ + float32x4_t __s2_420 = __p2_420; \ + float32x2_t __rev0_420; __rev0_420 = __builtin_shufflevector(__s0_420, __s0_420, 1, 0); \ + float32x4_t __rev2_420; __rev2_420 = __builtin_shufflevector(__s2_420, __s2_420, 3, 2, 1, 0); \ + float32x2_t __ret_420; \ + __ret_420 = __noswap_vset_lane_f32(__noswap_vgetq_lane_f32(__rev2_420, __p3_420), __rev0_420, __p1_420); \ + __ret_420 = __builtin_shufflevector(__ret_420, __ret_420, 1, 0); \ + __ret_420; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_s32(__p0_329, __p1_329, __p2_329, __p3_329) __extension__ ({ \ - int32x2_t __s0_329 = __p0_329; \ - int32x4_t __s2_329 = __p2_329; \ - int32x2_t __ret_329; \ - __ret_329 = vset_lane_s32(vgetq_lane_s32(__s2_329, __p3_329), __s0_329, __p1_329); \ - __ret_329; \ +#define vcopy_laneq_s32(__p0_421, __p1_421, __p2_421, __p3_421) __extension__ ({ \ + int32x2_t __s0_421 = __p0_421; \ + int32x4_t __s2_421 = __p2_421; \ + int32x2_t __ret_421; \ + __ret_421 = vset_lane_s32(vgetq_lane_s32(__s2_421, __p3_421), __s0_421, __p1_421); \ + __ret_421; \ }) #else -#define vcopy_laneq_s32(__p0_330, __p1_330, __p2_330, __p3_330) __extension__ ({ \ - int32x2_t __s0_330 = __p0_330; \ - int32x4_t __s2_330 = __p2_330; \ - int32x2_t __rev0_330; __rev0_330 = __builtin_shufflevector(__s0_330, __s0_330, 1, 0); \ - int32x4_t __rev2_330; __rev2_330 = __builtin_shufflevector(__s2_330, __s2_330, 3, 2, 1, 0); \ - int32x2_t __ret_330; \ - __ret_330 = __noswap_vset_lane_s32(__noswap_vgetq_lane_s32(__rev2_330, __p3_330), __rev0_330, __p1_330); \ - __ret_330 = __builtin_shufflevector(__ret_330, __ret_330, 1, 0); \ - __ret_330; \ +#define vcopy_laneq_s32(__p0_422, __p1_422, __p2_422, __p3_422) __extension__ ({ \ + int32x2_t __s0_422 = __p0_422; \ + int32x4_t __s2_422 = __p2_422; \ + int32x2_t __rev0_422; __rev0_422 = 
__builtin_shufflevector(__s0_422, __s0_422, 1, 0); \ + int32x4_t __rev2_422; __rev2_422 = __builtin_shufflevector(__s2_422, __s2_422, 3, 2, 1, 0); \ + int32x2_t __ret_422; \ + __ret_422 = __noswap_vset_lane_s32(__noswap_vgetq_lane_s32(__rev2_422, __p3_422), __rev0_422, __p1_422); \ + __ret_422 = __builtin_shufflevector(__ret_422, __ret_422, 1, 0); \ + __ret_422; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_s64(__p0_331, __p1_331, __p2_331, __p3_331) __extension__ ({ \ - int64x1_t __s0_331 = __p0_331; \ - int64x2_t __s2_331 = __p2_331; \ - int64x1_t __ret_331; \ - __ret_331 = vset_lane_s64(vgetq_lane_s64(__s2_331, __p3_331), __s0_331, __p1_331); \ - __ret_331; \ +#define vcopy_laneq_s64(__p0_423, __p1_423, __p2_423, __p3_423) __extension__ ({ \ + int64x1_t __s0_423 = __p0_423; \ + int64x2_t __s2_423 = __p2_423; \ + int64x1_t __ret_423; \ + __ret_423 = vset_lane_s64(vgetq_lane_s64(__s2_423, __p3_423), __s0_423, __p1_423); \ + __ret_423; \ }) #else -#define vcopy_laneq_s64(__p0_332, __p1_332, __p2_332, __p3_332) __extension__ ({ \ - int64x1_t __s0_332 = __p0_332; \ - int64x2_t __s2_332 = __p2_332; \ - int64x2_t __rev2_332; __rev2_332 = __builtin_shufflevector(__s2_332, __s2_332, 1, 0); \ - int64x1_t __ret_332; \ - __ret_332 = vset_lane_s64(__noswap_vgetq_lane_s64(__rev2_332, __p3_332), __s0_332, __p1_332); \ - __ret_332; \ +#define vcopy_laneq_s64(__p0_424, __p1_424, __p2_424, __p3_424) __extension__ ({ \ + int64x1_t __s0_424 = __p0_424; \ + int64x2_t __s2_424 = __p2_424; \ + int64x2_t __rev2_424; __rev2_424 = __builtin_shufflevector(__s2_424, __s2_424, 1, 0); \ + int64x1_t __ret_424; \ + __ret_424 = vset_lane_s64(__noswap_vgetq_lane_s64(__rev2_424, __p3_424), __s0_424, __p1_424); \ + __ret_424; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_s16(__p0_333, __p1_333, __p2_333, __p3_333) __extension__ ({ \ - int16x4_t __s0_333 = __p0_333; \ - int16x8_t __s2_333 = __p2_333; \ - int16x4_t __ret_333; \ - __ret_333 = vset_lane_s16(vgetq_lane_s16(__s2_333, __p3_333), __s0_333, __p1_333); \ - __ret_333; \ +#define vcopy_laneq_s16(__p0_425, __p1_425, __p2_425, __p3_425) __extension__ ({ \ + int16x4_t __s0_425 = __p0_425; \ + int16x8_t __s2_425 = __p2_425; \ + int16x4_t __ret_425; \ + __ret_425 = vset_lane_s16(vgetq_lane_s16(__s2_425, __p3_425), __s0_425, __p1_425); \ + __ret_425; \ }) #else -#define vcopy_laneq_s16(__p0_334, __p1_334, __p2_334, __p3_334) __extension__ ({ \ - int16x4_t __s0_334 = __p0_334; \ - int16x8_t __s2_334 = __p2_334; \ - int16x4_t __rev0_334; __rev0_334 = __builtin_shufflevector(__s0_334, __s0_334, 3, 2, 1, 0); \ - int16x8_t __rev2_334; __rev2_334 = __builtin_shufflevector(__s2_334, __s2_334, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x4_t __ret_334; \ - __ret_334 = __noswap_vset_lane_s16(__noswap_vgetq_lane_s16(__rev2_334, __p3_334), __rev0_334, __p1_334); \ - __ret_334 = __builtin_shufflevector(__ret_334, __ret_334, 3, 2, 1, 0); \ - __ret_334; \ +#define vcopy_laneq_s16(__p0_426, __p1_426, __p2_426, __p3_426) __extension__ ({ \ + int16x4_t __s0_426 = __p0_426; \ + int16x8_t __s2_426 = __p2_426; \ + int16x4_t __rev0_426; __rev0_426 = __builtin_shufflevector(__s0_426, __s0_426, 3, 2, 1, 0); \ + int16x8_t __rev2_426; __rev2_426 = __builtin_shufflevector(__s2_426, __s2_426, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x4_t __ret_426; \ + __ret_426 = __noswap_vset_lane_s16(__noswap_vgetq_lane_s16(__rev2_426, __p3_426), __rev0_426, __p1_426); \ + __ret_426 = __builtin_shufflevector(__ret_426, __ret_426, 3, 2, 1, 0); \ + __ret_426; \ }) #endif @@ -49009,85 +50713,85 @@ __ai 
float32x2_t vdiv_f32(float32x2_t __p0, float32x2_t __p1) { }) #endif -#define vdup_lane_p64(__p0_335, __p1_335) __extension__ ({ \ - poly64x1_t __s0_335 = __p0_335; \ - poly64x1_t __ret_335; \ - __ret_335 = splat_lane_p64(__s0_335, __p1_335); \ - __ret_335; \ +#define vdup_lane_p64(__p0_427, __p1_427) __extension__ ({ \ + poly64x1_t __s0_427 = __p0_427; \ + poly64x1_t __ret_427; \ + __ret_427 = splat_lane_p64(__s0_427, __p1_427); \ + __ret_427; \ }) #ifdef __LITTLE_ENDIAN__ -#define vdupq_lane_p64(__p0_336, __p1_336) __extension__ ({ \ - poly64x1_t __s0_336 = __p0_336; \ - poly64x2_t __ret_336; \ - __ret_336 = splatq_lane_p64(__s0_336, __p1_336); \ - __ret_336; \ +#define vdupq_lane_p64(__p0_428, __p1_428) __extension__ ({ \ + poly64x1_t __s0_428 = __p0_428; \ + poly64x2_t __ret_428; \ + __ret_428 = splatq_lane_p64(__s0_428, __p1_428); \ + __ret_428; \ }) #else -#define vdupq_lane_p64(__p0_337, __p1_337) __extension__ ({ \ - poly64x1_t __s0_337 = __p0_337; \ - poly64x2_t __ret_337; \ - __ret_337 = __noswap_splatq_lane_p64(__s0_337, __p1_337); \ - __ret_337 = __builtin_shufflevector(__ret_337, __ret_337, 1, 0); \ - __ret_337; \ +#define vdupq_lane_p64(__p0_429, __p1_429) __extension__ ({ \ + poly64x1_t __s0_429 = __p0_429; \ + poly64x2_t __ret_429; \ + __ret_429 = __noswap_splatq_lane_p64(__s0_429, __p1_429); \ + __ret_429 = __builtin_shufflevector(__ret_429, __ret_429, 1, 0); \ + __ret_429; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdupq_lane_f64(__p0_338, __p1_338) __extension__ ({ \ - float64x1_t __s0_338 = __p0_338; \ - float64x2_t __ret_338; \ - __ret_338 = splatq_lane_f64(__s0_338, __p1_338); \ - __ret_338; \ +#define vdupq_lane_f64(__p0_430, __p1_430) __extension__ ({ \ + float64x1_t __s0_430 = __p0_430; \ + float64x2_t __ret_430; \ + __ret_430 = splatq_lane_f64(__s0_430, __p1_430); \ + __ret_430; \ }) #else -#define vdupq_lane_f64(__p0_339, __p1_339) __extension__ ({ \ - float64x1_t __s0_339 = __p0_339; \ - float64x2_t __ret_339; \ - __ret_339 = __noswap_splatq_lane_f64(__s0_339, __p1_339); \ - __ret_339 = __builtin_shufflevector(__ret_339, __ret_339, 1, 0); \ - __ret_339; \ +#define vdupq_lane_f64(__p0_431, __p1_431) __extension__ ({ \ + float64x1_t __s0_431 = __p0_431; \ + float64x2_t __ret_431; \ + __ret_431 = __noswap_splatq_lane_f64(__s0_431, __p1_431); \ + __ret_431 = __builtin_shufflevector(__ret_431, __ret_431, 1, 0); \ + __ret_431; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdupq_lane_f16(__p0_340, __p1_340) __extension__ ({ \ - float16x4_t __s0_340 = __p0_340; \ - float16x8_t __ret_340; \ - __ret_340 = splatq_lane_f16(__s0_340, __p1_340); \ - __ret_340; \ +#define vdupq_lane_f16(__p0_432, __p1_432) __extension__ ({ \ + float16x4_t __s0_432 = __p0_432; \ + float16x8_t __ret_432; \ + __ret_432 = splatq_lane_f16(__s0_432, __p1_432); \ + __ret_432; \ }) #else -#define vdupq_lane_f16(__p0_341, __p1_341) __extension__ ({ \ - float16x4_t __s0_341 = __p0_341; \ - float16x4_t __rev0_341; __rev0_341 = __builtin_shufflevector(__s0_341, __s0_341, 3, 2, 1, 0); \ - float16x8_t __ret_341; \ - __ret_341 = __noswap_splatq_lane_f16(__rev0_341, __p1_341); \ - __ret_341 = __builtin_shufflevector(__ret_341, __ret_341, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_341; \ +#define vdupq_lane_f16(__p0_433, __p1_433) __extension__ ({ \ + float16x4_t __s0_433 = __p0_433; \ + float16x4_t __rev0_433; __rev0_433 = __builtin_shufflevector(__s0_433, __s0_433, 3, 2, 1, 0); \ + float16x8_t __ret_433; \ + __ret_433 = __noswap_splatq_lane_f16(__rev0_433, __p1_433); \ + __ret_433 = 
__builtin_shufflevector(__ret_433, __ret_433, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_433; \ }) #endif -#define vdup_lane_f64(__p0_342, __p1_342) __extension__ ({ \ - float64x1_t __s0_342 = __p0_342; \ - float64x1_t __ret_342; \ - __ret_342 = splat_lane_f64(__s0_342, __p1_342); \ - __ret_342; \ +#define vdup_lane_f64(__p0_434, __p1_434) __extension__ ({ \ + float64x1_t __s0_434 = __p0_434; \ + float64x1_t __ret_434; \ + __ret_434 = splat_lane_f64(__s0_434, __p1_434); \ + __ret_434; \ }) #ifdef __LITTLE_ENDIAN__ -#define vdup_lane_f16(__p0_343, __p1_343) __extension__ ({ \ - float16x4_t __s0_343 = __p0_343; \ - float16x4_t __ret_343; \ - __ret_343 = splat_lane_f16(__s0_343, __p1_343); \ - __ret_343; \ +#define vdup_lane_f16(__p0_435, __p1_435) __extension__ ({ \ + float16x4_t __s0_435 = __p0_435; \ + float16x4_t __ret_435; \ + __ret_435 = splat_lane_f16(__s0_435, __p1_435); \ + __ret_435; \ }) #else -#define vdup_lane_f16(__p0_344, __p1_344) __extension__ ({ \ - float16x4_t __s0_344 = __p0_344; \ - float16x4_t __rev0_344; __rev0_344 = __builtin_shufflevector(__s0_344, __s0_344, 3, 2, 1, 0); \ - float16x4_t __ret_344; \ - __ret_344 = __noswap_splat_lane_f16(__rev0_344, __p1_344); \ - __ret_344 = __builtin_shufflevector(__ret_344, __ret_344, 3, 2, 1, 0); \ - __ret_344; \ +#define vdup_lane_f16(__p0_436, __p1_436) __extension__ ({ \ + float16x4_t __s0_436 = __p0_436; \ + float16x4_t __rev0_436; __rev0_436 = __builtin_shufflevector(__s0_436, __s0_436, 3, 2, 1, 0); \ + float16x4_t __ret_436; \ + __ret_436 = __noswap_splat_lane_f16(__rev0_436, __p1_436); \ + __ret_436 = __builtin_shufflevector(__ret_436, __ret_436, 3, 2, 1, 0); \ + __ret_436; \ }) #endif @@ -49296,502 +51000,502 @@ __ai float32x2_t vdiv_f32(float32x2_t __p0, float32x2_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vdup_laneq_p8(__p0_345, __p1_345) __extension__ ({ \ - poly8x16_t __s0_345 = __p0_345; \ - poly8x8_t __ret_345; \ - __ret_345 = splat_laneq_p8(__s0_345, __p1_345); \ - __ret_345; \ +#define vdup_laneq_p8(__p0_437, __p1_437) __extension__ ({ \ + poly8x16_t __s0_437 = __p0_437; \ + poly8x8_t __ret_437; \ + __ret_437 = splat_laneq_p8(__s0_437, __p1_437); \ + __ret_437; \ }) #else -#define vdup_laneq_p8(__p0_346, __p1_346) __extension__ ({ \ - poly8x16_t __s0_346 = __p0_346; \ - poly8x16_t __rev0_346; __rev0_346 = __builtin_shufflevector(__s0_346, __s0_346, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly8x8_t __ret_346; \ - __ret_346 = __noswap_splat_laneq_p8(__rev0_346, __p1_346); \ - __ret_346 = __builtin_shufflevector(__ret_346, __ret_346, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_346; \ +#define vdup_laneq_p8(__p0_438, __p1_438) __extension__ ({ \ + poly8x16_t __s0_438 = __p0_438; \ + poly8x16_t __rev0_438; __rev0_438 = __builtin_shufflevector(__s0_438, __s0_438, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly8x8_t __ret_438; \ + __ret_438 = __noswap_splat_laneq_p8(__rev0_438, __p1_438); \ + __ret_438 = __builtin_shufflevector(__ret_438, __ret_438, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_438; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdup_laneq_p64(__p0_347, __p1_347) __extension__ ({ \ - poly64x2_t __s0_347 = __p0_347; \ - poly64x1_t __ret_347; \ - __ret_347 = splat_laneq_p64(__s0_347, __p1_347); \ - __ret_347; \ +#define vdup_laneq_p64(__p0_439, __p1_439) __extension__ ({ \ + poly64x2_t __s0_439 = __p0_439; \ + poly64x1_t __ret_439; \ + __ret_439 = splat_laneq_p64(__s0_439, __p1_439); \ + __ret_439; \ }) #else -#define vdup_laneq_p64(__p0_348, __p1_348) __extension__ ({ \ - poly64x2_t __s0_348 = 
__p0_348; \ - poly64x2_t __rev0_348; __rev0_348 = __builtin_shufflevector(__s0_348, __s0_348, 1, 0); \ - poly64x1_t __ret_348; \ - __ret_348 = __noswap_splat_laneq_p64(__rev0_348, __p1_348); \ - __ret_348; \ +#define vdup_laneq_p64(__p0_440, __p1_440) __extension__ ({ \ + poly64x2_t __s0_440 = __p0_440; \ + poly64x2_t __rev0_440; __rev0_440 = __builtin_shufflevector(__s0_440, __s0_440, 1, 0); \ + poly64x1_t __ret_440; \ + __ret_440 = __noswap_splat_laneq_p64(__rev0_440, __p1_440); \ + __ret_440; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdup_laneq_p16(__p0_349, __p1_349) __extension__ ({ \ - poly16x8_t __s0_349 = __p0_349; \ - poly16x4_t __ret_349; \ - __ret_349 = splat_laneq_p16(__s0_349, __p1_349); \ - __ret_349; \ +#define vdup_laneq_p16(__p0_441, __p1_441) __extension__ ({ \ + poly16x8_t __s0_441 = __p0_441; \ + poly16x4_t __ret_441; \ + __ret_441 = splat_laneq_p16(__s0_441, __p1_441); \ + __ret_441; \ }) #else -#define vdup_laneq_p16(__p0_350, __p1_350) __extension__ ({ \ - poly16x8_t __s0_350 = __p0_350; \ - poly16x8_t __rev0_350; __rev0_350 = __builtin_shufflevector(__s0_350, __s0_350, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly16x4_t __ret_350; \ - __ret_350 = __noswap_splat_laneq_p16(__rev0_350, __p1_350); \ - __ret_350 = __builtin_shufflevector(__ret_350, __ret_350, 3, 2, 1, 0); \ - __ret_350; \ +#define vdup_laneq_p16(__p0_442, __p1_442) __extension__ ({ \ + poly16x8_t __s0_442 = __p0_442; \ + poly16x8_t __rev0_442; __rev0_442 = __builtin_shufflevector(__s0_442, __s0_442, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly16x4_t __ret_442; \ + __ret_442 = __noswap_splat_laneq_p16(__rev0_442, __p1_442); \ + __ret_442 = __builtin_shufflevector(__ret_442, __ret_442, 3, 2, 1, 0); \ + __ret_442; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdupq_laneq_p8(__p0_351, __p1_351) __extension__ ({ \ - poly8x16_t __s0_351 = __p0_351; \ - poly8x16_t __ret_351; \ - __ret_351 = splatq_laneq_p8(__s0_351, __p1_351); \ - __ret_351; \ +#define vdupq_laneq_p8(__p0_443, __p1_443) __extension__ ({ \ + poly8x16_t __s0_443 = __p0_443; \ + poly8x16_t __ret_443; \ + __ret_443 = splatq_laneq_p8(__s0_443, __p1_443); \ + __ret_443; \ }) #else -#define vdupq_laneq_p8(__p0_352, __p1_352) __extension__ ({ \ - poly8x16_t __s0_352 = __p0_352; \ - poly8x16_t __rev0_352; __rev0_352 = __builtin_shufflevector(__s0_352, __s0_352, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly8x16_t __ret_352; \ - __ret_352 = __noswap_splatq_laneq_p8(__rev0_352, __p1_352); \ - __ret_352 = __builtin_shufflevector(__ret_352, __ret_352, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_352; \ +#define vdupq_laneq_p8(__p0_444, __p1_444) __extension__ ({ \ + poly8x16_t __s0_444 = __p0_444; \ + poly8x16_t __rev0_444; __rev0_444 = __builtin_shufflevector(__s0_444, __s0_444, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly8x16_t __ret_444; \ + __ret_444 = __noswap_splatq_laneq_p8(__rev0_444, __p1_444); \ + __ret_444 = __builtin_shufflevector(__ret_444, __ret_444, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_444; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdupq_laneq_p64(__p0_353, __p1_353) __extension__ ({ \ - poly64x2_t __s0_353 = __p0_353; \ - poly64x2_t __ret_353; \ - __ret_353 = splatq_laneq_p64(__s0_353, __p1_353); \ - __ret_353; \ +#define vdupq_laneq_p64(__p0_445, __p1_445) __extension__ ({ \ + poly64x2_t __s0_445 = __p0_445; \ + poly64x2_t __ret_445; \ + __ret_445 = splatq_laneq_p64(__s0_445, __p1_445); \ + __ret_445; \ }) #else -#define vdupq_laneq_p64(__p0_354, __p1_354) __extension__ ({ 
\ - poly64x2_t __s0_354 = __p0_354; \ - poly64x2_t __rev0_354; __rev0_354 = __builtin_shufflevector(__s0_354, __s0_354, 1, 0); \ - poly64x2_t __ret_354; \ - __ret_354 = __noswap_splatq_laneq_p64(__rev0_354, __p1_354); \ - __ret_354 = __builtin_shufflevector(__ret_354, __ret_354, 1, 0); \ - __ret_354; \ +#define vdupq_laneq_p64(__p0_446, __p1_446) __extension__ ({ \ + poly64x2_t __s0_446 = __p0_446; \ + poly64x2_t __rev0_446; __rev0_446 = __builtin_shufflevector(__s0_446, __s0_446, 1, 0); \ + poly64x2_t __ret_446; \ + __ret_446 = __noswap_splatq_laneq_p64(__rev0_446, __p1_446); \ + __ret_446 = __builtin_shufflevector(__ret_446, __ret_446, 1, 0); \ + __ret_446; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdupq_laneq_p16(__p0_355, __p1_355) __extension__ ({ \ - poly16x8_t __s0_355 = __p0_355; \ - poly16x8_t __ret_355; \ - __ret_355 = splatq_laneq_p16(__s0_355, __p1_355); \ - __ret_355; \ +#define vdupq_laneq_p16(__p0_447, __p1_447) __extension__ ({ \ + poly16x8_t __s0_447 = __p0_447; \ + poly16x8_t __ret_447; \ + __ret_447 = splatq_laneq_p16(__s0_447, __p1_447); \ + __ret_447; \ }) #else -#define vdupq_laneq_p16(__p0_356, __p1_356) __extension__ ({ \ - poly16x8_t __s0_356 = __p0_356; \ - poly16x8_t __rev0_356; __rev0_356 = __builtin_shufflevector(__s0_356, __s0_356, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly16x8_t __ret_356; \ - __ret_356 = __noswap_splatq_laneq_p16(__rev0_356, __p1_356); \ - __ret_356 = __builtin_shufflevector(__ret_356, __ret_356, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_356; \ +#define vdupq_laneq_p16(__p0_448, __p1_448) __extension__ ({ \ + poly16x8_t __s0_448 = __p0_448; \ + poly16x8_t __rev0_448; __rev0_448 = __builtin_shufflevector(__s0_448, __s0_448, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly16x8_t __ret_448; \ + __ret_448 = __noswap_splatq_laneq_p16(__rev0_448, __p1_448); \ + __ret_448 = __builtin_shufflevector(__ret_448, __ret_448, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_448; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdupq_laneq_u8(__p0_357, __p1_357) __extension__ ({ \ - uint8x16_t __s0_357 = __p0_357; \ - uint8x16_t __ret_357; \ - __ret_357 = splatq_laneq_u8(__s0_357, __p1_357); \ - __ret_357; \ +#define vdupq_laneq_u8(__p0_449, __p1_449) __extension__ ({ \ + uint8x16_t __s0_449 = __p0_449; \ + uint8x16_t __ret_449; \ + __ret_449 = splatq_laneq_u8(__s0_449, __p1_449); \ + __ret_449; \ }) #else -#define vdupq_laneq_u8(__p0_358, __p1_358) __extension__ ({ \ - uint8x16_t __s0_358 = __p0_358; \ - uint8x16_t __rev0_358; __rev0_358 = __builtin_shufflevector(__s0_358, __s0_358, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __ret_358; \ - __ret_358 = __noswap_splatq_laneq_u8(__rev0_358, __p1_358); \ - __ret_358 = __builtin_shufflevector(__ret_358, __ret_358, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_358; \ +#define vdupq_laneq_u8(__p0_450, __p1_450) __extension__ ({ \ + uint8x16_t __s0_450 = __p0_450; \ + uint8x16_t __rev0_450; __rev0_450 = __builtin_shufflevector(__s0_450, __s0_450, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __ret_450; \ + __ret_450 = __noswap_splatq_laneq_u8(__rev0_450, __p1_450); \ + __ret_450 = __builtin_shufflevector(__ret_450, __ret_450, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_450; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdupq_laneq_u32(__p0_359, __p1_359) __extension__ ({ \ - uint32x4_t __s0_359 = __p0_359; \ - uint32x4_t __ret_359; \ - __ret_359 = splatq_laneq_u32(__s0_359, __p1_359); \ - __ret_359; \ +#define vdupq_laneq_u32(__p0_451, __p1_451) __extension__ ({ \ + 
uint32x4_t __s0_451 = __p0_451; \ + uint32x4_t __ret_451; \ + __ret_451 = splatq_laneq_u32(__s0_451, __p1_451); \ + __ret_451; \ }) #else -#define vdupq_laneq_u32(__p0_360, __p1_360) __extension__ ({ \ - uint32x4_t __s0_360 = __p0_360; \ - uint32x4_t __rev0_360; __rev0_360 = __builtin_shufflevector(__s0_360, __s0_360, 3, 2, 1, 0); \ - uint32x4_t __ret_360; \ - __ret_360 = __noswap_splatq_laneq_u32(__rev0_360, __p1_360); \ - __ret_360 = __builtin_shufflevector(__ret_360, __ret_360, 3, 2, 1, 0); \ - __ret_360; \ +#define vdupq_laneq_u32(__p0_452, __p1_452) __extension__ ({ \ + uint32x4_t __s0_452 = __p0_452; \ + uint32x4_t __rev0_452; __rev0_452 = __builtin_shufflevector(__s0_452, __s0_452, 3, 2, 1, 0); \ + uint32x4_t __ret_452; \ + __ret_452 = __noswap_splatq_laneq_u32(__rev0_452, __p1_452); \ + __ret_452 = __builtin_shufflevector(__ret_452, __ret_452, 3, 2, 1, 0); \ + __ret_452; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdupq_laneq_u64(__p0_361, __p1_361) __extension__ ({ \ - uint64x2_t __s0_361 = __p0_361; \ - uint64x2_t __ret_361; \ - __ret_361 = splatq_laneq_u64(__s0_361, __p1_361); \ - __ret_361; \ +#define vdupq_laneq_u64(__p0_453, __p1_453) __extension__ ({ \ + uint64x2_t __s0_453 = __p0_453; \ + uint64x2_t __ret_453; \ + __ret_453 = splatq_laneq_u64(__s0_453, __p1_453); \ + __ret_453; \ }) #else -#define vdupq_laneq_u64(__p0_362, __p1_362) __extension__ ({ \ - uint64x2_t __s0_362 = __p0_362; \ - uint64x2_t __rev0_362; __rev0_362 = __builtin_shufflevector(__s0_362, __s0_362, 1, 0); \ - uint64x2_t __ret_362; \ - __ret_362 = __noswap_splatq_laneq_u64(__rev0_362, __p1_362); \ - __ret_362 = __builtin_shufflevector(__ret_362, __ret_362, 1, 0); \ - __ret_362; \ +#define vdupq_laneq_u64(__p0_454, __p1_454) __extension__ ({ \ + uint64x2_t __s0_454 = __p0_454; \ + uint64x2_t __rev0_454; __rev0_454 = __builtin_shufflevector(__s0_454, __s0_454, 1, 0); \ + uint64x2_t __ret_454; \ + __ret_454 = __noswap_splatq_laneq_u64(__rev0_454, __p1_454); \ + __ret_454 = __builtin_shufflevector(__ret_454, __ret_454, 1, 0); \ + __ret_454; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdupq_laneq_u16(__p0_363, __p1_363) __extension__ ({ \ - uint16x8_t __s0_363 = __p0_363; \ - uint16x8_t __ret_363; \ - __ret_363 = splatq_laneq_u16(__s0_363, __p1_363); \ - __ret_363; \ +#define vdupq_laneq_u16(__p0_455, __p1_455) __extension__ ({ \ + uint16x8_t __s0_455 = __p0_455; \ + uint16x8_t __ret_455; \ + __ret_455 = splatq_laneq_u16(__s0_455, __p1_455); \ + __ret_455; \ }) #else -#define vdupq_laneq_u16(__p0_364, __p1_364) __extension__ ({ \ - uint16x8_t __s0_364 = __p0_364; \ - uint16x8_t __rev0_364; __rev0_364 = __builtin_shufflevector(__s0_364, __s0_364, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __ret_364; \ - __ret_364 = __noswap_splatq_laneq_u16(__rev0_364, __p1_364); \ - __ret_364 = __builtin_shufflevector(__ret_364, __ret_364, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_364; \ +#define vdupq_laneq_u16(__p0_456, __p1_456) __extension__ ({ \ + uint16x8_t __s0_456 = __p0_456; \ + uint16x8_t __rev0_456; __rev0_456 = __builtin_shufflevector(__s0_456, __s0_456, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __ret_456; \ + __ret_456 = __noswap_splatq_laneq_u16(__rev0_456, __p1_456); \ + __ret_456 = __builtin_shufflevector(__ret_456, __ret_456, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_456; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdupq_laneq_s8(__p0_365, __p1_365) __extension__ ({ \ - int8x16_t __s0_365 = __p0_365; \ - int8x16_t __ret_365; \ - __ret_365 = splatq_laneq_s8(__s0_365, __p1_365); \ - __ret_365; \ +#define 
vdupq_laneq_s8(__p0_457, __p1_457) __extension__ ({ \ + int8x16_t __s0_457 = __p0_457; \ + int8x16_t __ret_457; \ + __ret_457 = splatq_laneq_s8(__s0_457, __p1_457); \ + __ret_457; \ }) #else -#define vdupq_laneq_s8(__p0_366, __p1_366) __extension__ ({ \ - int8x16_t __s0_366 = __p0_366; \ - int8x16_t __rev0_366; __rev0_366 = __builtin_shufflevector(__s0_366, __s0_366, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __ret_366; \ - __ret_366 = __noswap_splatq_laneq_s8(__rev0_366, __p1_366); \ - __ret_366 = __builtin_shufflevector(__ret_366, __ret_366, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_366; \ +#define vdupq_laneq_s8(__p0_458, __p1_458) __extension__ ({ \ + int8x16_t __s0_458 = __p0_458; \ + int8x16_t __rev0_458; __rev0_458 = __builtin_shufflevector(__s0_458, __s0_458, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __ret_458; \ + __ret_458 = __noswap_splatq_laneq_s8(__rev0_458, __p1_458); \ + __ret_458 = __builtin_shufflevector(__ret_458, __ret_458, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_458; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdupq_laneq_f64(__p0_367, __p1_367) __extension__ ({ \ - float64x2_t __s0_367 = __p0_367; \ - float64x2_t __ret_367; \ - __ret_367 = splatq_laneq_f64(__s0_367, __p1_367); \ - __ret_367; \ +#define vdupq_laneq_f64(__p0_459, __p1_459) __extension__ ({ \ + float64x2_t __s0_459 = __p0_459; \ + float64x2_t __ret_459; \ + __ret_459 = splatq_laneq_f64(__s0_459, __p1_459); \ + __ret_459; \ }) #else -#define vdupq_laneq_f64(__p0_368, __p1_368) __extension__ ({ \ - float64x2_t __s0_368 = __p0_368; \ - float64x2_t __rev0_368; __rev0_368 = __builtin_shufflevector(__s0_368, __s0_368, 1, 0); \ - float64x2_t __ret_368; \ - __ret_368 = __noswap_splatq_laneq_f64(__rev0_368, __p1_368); \ - __ret_368 = __builtin_shufflevector(__ret_368, __ret_368, 1, 0); \ - __ret_368; \ +#define vdupq_laneq_f64(__p0_460, __p1_460) __extension__ ({ \ + float64x2_t __s0_460 = __p0_460; \ + float64x2_t __rev0_460; __rev0_460 = __builtin_shufflevector(__s0_460, __s0_460, 1, 0); \ + float64x2_t __ret_460; \ + __ret_460 = __noswap_splatq_laneq_f64(__rev0_460, __p1_460); \ + __ret_460 = __builtin_shufflevector(__ret_460, __ret_460, 1, 0); \ + __ret_460; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdupq_laneq_f32(__p0_369, __p1_369) __extension__ ({ \ - float32x4_t __s0_369 = __p0_369; \ - float32x4_t __ret_369; \ - __ret_369 = splatq_laneq_f32(__s0_369, __p1_369); \ - __ret_369; \ +#define vdupq_laneq_f32(__p0_461, __p1_461) __extension__ ({ \ + float32x4_t __s0_461 = __p0_461; \ + float32x4_t __ret_461; \ + __ret_461 = splatq_laneq_f32(__s0_461, __p1_461); \ + __ret_461; \ }) #else -#define vdupq_laneq_f32(__p0_370, __p1_370) __extension__ ({ \ - float32x4_t __s0_370 = __p0_370; \ - float32x4_t __rev0_370; __rev0_370 = __builtin_shufflevector(__s0_370, __s0_370, 3, 2, 1, 0); \ - float32x4_t __ret_370; \ - __ret_370 = __noswap_splatq_laneq_f32(__rev0_370, __p1_370); \ - __ret_370 = __builtin_shufflevector(__ret_370, __ret_370, 3, 2, 1, 0); \ - __ret_370; \ +#define vdupq_laneq_f32(__p0_462, __p1_462) __extension__ ({ \ + float32x4_t __s0_462 = __p0_462; \ + float32x4_t __rev0_462; __rev0_462 = __builtin_shufflevector(__s0_462, __s0_462, 3, 2, 1, 0); \ + float32x4_t __ret_462; \ + __ret_462 = __noswap_splatq_laneq_f32(__rev0_462, __p1_462); \ + __ret_462 = __builtin_shufflevector(__ret_462, __ret_462, 3, 2, 1, 0); \ + __ret_462; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdupq_laneq_f16(__p0_371, 
__p1_371) __extension__ ({ \ - float16x8_t __s0_371 = __p0_371; \ - float16x8_t __ret_371; \ - __ret_371 = splatq_laneq_f16(__s0_371, __p1_371); \ - __ret_371; \ +#define vdupq_laneq_f16(__p0_463, __p1_463) __extension__ ({ \ + float16x8_t __s0_463 = __p0_463; \ + float16x8_t __ret_463; \ + __ret_463 = splatq_laneq_f16(__s0_463, __p1_463); \ + __ret_463; \ }) #else -#define vdupq_laneq_f16(__p0_372, __p1_372) __extension__ ({ \ - float16x8_t __s0_372 = __p0_372; \ - float16x8_t __rev0_372; __rev0_372 = __builtin_shufflevector(__s0_372, __s0_372, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x8_t __ret_372; \ - __ret_372 = __noswap_splatq_laneq_f16(__rev0_372, __p1_372); \ - __ret_372 = __builtin_shufflevector(__ret_372, __ret_372, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_372; \ +#define vdupq_laneq_f16(__p0_464, __p1_464) __extension__ ({ \ + float16x8_t __s0_464 = __p0_464; \ + float16x8_t __rev0_464; __rev0_464 = __builtin_shufflevector(__s0_464, __s0_464, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret_464; \ + __ret_464 = __noswap_splatq_laneq_f16(__rev0_464, __p1_464); \ + __ret_464 = __builtin_shufflevector(__ret_464, __ret_464, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_464; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdupq_laneq_s32(__p0_373, __p1_373) __extension__ ({ \ - int32x4_t __s0_373 = __p0_373; \ - int32x4_t __ret_373; \ - __ret_373 = splatq_laneq_s32(__s0_373, __p1_373); \ - __ret_373; \ +#define vdupq_laneq_s32(__p0_465, __p1_465) __extension__ ({ \ + int32x4_t __s0_465 = __p0_465; \ + int32x4_t __ret_465; \ + __ret_465 = splatq_laneq_s32(__s0_465, __p1_465); \ + __ret_465; \ }) #else -#define vdupq_laneq_s32(__p0_374, __p1_374) __extension__ ({ \ - int32x4_t __s0_374 = __p0_374; \ - int32x4_t __rev0_374; __rev0_374 = __builtin_shufflevector(__s0_374, __s0_374, 3, 2, 1, 0); \ - int32x4_t __ret_374; \ - __ret_374 = __noswap_splatq_laneq_s32(__rev0_374, __p1_374); \ - __ret_374 = __builtin_shufflevector(__ret_374, __ret_374, 3, 2, 1, 0); \ - __ret_374; \ +#define vdupq_laneq_s32(__p0_466, __p1_466) __extension__ ({ \ + int32x4_t __s0_466 = __p0_466; \ + int32x4_t __rev0_466; __rev0_466 = __builtin_shufflevector(__s0_466, __s0_466, 3, 2, 1, 0); \ + int32x4_t __ret_466; \ + __ret_466 = __noswap_splatq_laneq_s32(__rev0_466, __p1_466); \ + __ret_466 = __builtin_shufflevector(__ret_466, __ret_466, 3, 2, 1, 0); \ + __ret_466; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdupq_laneq_s64(__p0_375, __p1_375) __extension__ ({ \ - int64x2_t __s0_375 = __p0_375; \ - int64x2_t __ret_375; \ - __ret_375 = splatq_laneq_s64(__s0_375, __p1_375); \ - __ret_375; \ +#define vdupq_laneq_s64(__p0_467, __p1_467) __extension__ ({ \ + int64x2_t __s0_467 = __p0_467; \ + int64x2_t __ret_467; \ + __ret_467 = splatq_laneq_s64(__s0_467, __p1_467); \ + __ret_467; \ }) #else -#define vdupq_laneq_s64(__p0_376, __p1_376) __extension__ ({ \ - int64x2_t __s0_376 = __p0_376; \ - int64x2_t __rev0_376; __rev0_376 = __builtin_shufflevector(__s0_376, __s0_376, 1, 0); \ - int64x2_t __ret_376; \ - __ret_376 = __noswap_splatq_laneq_s64(__rev0_376, __p1_376); \ - __ret_376 = __builtin_shufflevector(__ret_376, __ret_376, 1, 0); \ - __ret_376; \ +#define vdupq_laneq_s64(__p0_468, __p1_468) __extension__ ({ \ + int64x2_t __s0_468 = __p0_468; \ + int64x2_t __rev0_468; __rev0_468 = __builtin_shufflevector(__s0_468, __s0_468, 1, 0); \ + int64x2_t __ret_468; \ + __ret_468 = __noswap_splatq_laneq_s64(__rev0_468, __p1_468); \ + __ret_468 = __builtin_shufflevector(__ret_468, __ret_468, 1, 0); \ + __ret_468; \ }) #endif #ifdef __LITTLE_ENDIAN__ 
-#define vdupq_laneq_s16(__p0_377, __p1_377) __extension__ ({ \ - int16x8_t __s0_377 = __p0_377; \ - int16x8_t __ret_377; \ - __ret_377 = splatq_laneq_s16(__s0_377, __p1_377); \ - __ret_377; \ +#define vdupq_laneq_s16(__p0_469, __p1_469) __extension__ ({ \ + int16x8_t __s0_469 = __p0_469; \ + int16x8_t __ret_469; \ + __ret_469 = splatq_laneq_s16(__s0_469, __p1_469); \ + __ret_469; \ }) #else -#define vdupq_laneq_s16(__p0_378, __p1_378) __extension__ ({ \ - int16x8_t __s0_378 = __p0_378; \ - int16x8_t __rev0_378; __rev0_378 = __builtin_shufflevector(__s0_378, __s0_378, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __ret_378; \ - __ret_378 = __noswap_splatq_laneq_s16(__rev0_378, __p1_378); \ - __ret_378 = __builtin_shufflevector(__ret_378, __ret_378, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_378; \ +#define vdupq_laneq_s16(__p0_470, __p1_470) __extension__ ({ \ + int16x8_t __s0_470 = __p0_470; \ + int16x8_t __rev0_470; __rev0_470 = __builtin_shufflevector(__s0_470, __s0_470, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __ret_470; \ + __ret_470 = __noswap_splatq_laneq_s16(__rev0_470, __p1_470); \ + __ret_470 = __builtin_shufflevector(__ret_470, __ret_470, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_470; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdup_laneq_u8(__p0_379, __p1_379) __extension__ ({ \ - uint8x16_t __s0_379 = __p0_379; \ - uint8x8_t __ret_379; \ - __ret_379 = splat_laneq_u8(__s0_379, __p1_379); \ - __ret_379; \ +#define vdup_laneq_u8(__p0_471, __p1_471) __extension__ ({ \ + uint8x16_t __s0_471 = __p0_471; \ + uint8x8_t __ret_471; \ + __ret_471 = splat_laneq_u8(__s0_471, __p1_471); \ + __ret_471; \ }) #else -#define vdup_laneq_u8(__p0_380, __p1_380) __extension__ ({ \ - uint8x16_t __s0_380 = __p0_380; \ - uint8x16_t __rev0_380; __rev0_380 = __builtin_shufflevector(__s0_380, __s0_380, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x8_t __ret_380; \ - __ret_380 = __noswap_splat_laneq_u8(__rev0_380, __p1_380); \ - __ret_380 = __builtin_shufflevector(__ret_380, __ret_380, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_380; \ +#define vdup_laneq_u8(__p0_472, __p1_472) __extension__ ({ \ + uint8x16_t __s0_472 = __p0_472; \ + uint8x16_t __rev0_472; __rev0_472 = __builtin_shufflevector(__s0_472, __s0_472, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __ret_472; \ + __ret_472 = __noswap_splat_laneq_u8(__rev0_472, __p1_472); \ + __ret_472 = __builtin_shufflevector(__ret_472, __ret_472, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_472; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdup_laneq_u32(__p0_381, __p1_381) __extension__ ({ \ - uint32x4_t __s0_381 = __p0_381; \ - uint32x2_t __ret_381; \ - __ret_381 = splat_laneq_u32(__s0_381, __p1_381); \ - __ret_381; \ +#define vdup_laneq_u32(__p0_473, __p1_473) __extension__ ({ \ + uint32x4_t __s0_473 = __p0_473; \ + uint32x2_t __ret_473; \ + __ret_473 = splat_laneq_u32(__s0_473, __p1_473); \ + __ret_473; \ }) #else -#define vdup_laneq_u32(__p0_382, __p1_382) __extension__ ({ \ - uint32x4_t __s0_382 = __p0_382; \ - uint32x4_t __rev0_382; __rev0_382 = __builtin_shufflevector(__s0_382, __s0_382, 3, 2, 1, 0); \ - uint32x2_t __ret_382; \ - __ret_382 = __noswap_splat_laneq_u32(__rev0_382, __p1_382); \ - __ret_382 = __builtin_shufflevector(__ret_382, __ret_382, 1, 0); \ - __ret_382; \ +#define vdup_laneq_u32(__p0_474, __p1_474) __extension__ ({ \ + uint32x4_t __s0_474 = __p0_474; \ + uint32x4_t __rev0_474; __rev0_474 = __builtin_shufflevector(__s0_474, __s0_474, 3, 2, 1, 0); \ + uint32x2_t __ret_474; \ + __ret_474 = __noswap_splat_laneq_u32(__rev0_474, __p1_474); 
\ + __ret_474 = __builtin_shufflevector(__ret_474, __ret_474, 1, 0); \ + __ret_474; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdup_laneq_u64(__p0_383, __p1_383) __extension__ ({ \ - uint64x2_t __s0_383 = __p0_383; \ - uint64x1_t __ret_383; \ - __ret_383 = splat_laneq_u64(__s0_383, __p1_383); \ - __ret_383; \ +#define vdup_laneq_u64(__p0_475, __p1_475) __extension__ ({ \ + uint64x2_t __s0_475 = __p0_475; \ + uint64x1_t __ret_475; \ + __ret_475 = splat_laneq_u64(__s0_475, __p1_475); \ + __ret_475; \ }) #else -#define vdup_laneq_u64(__p0_384, __p1_384) __extension__ ({ \ - uint64x2_t __s0_384 = __p0_384; \ - uint64x2_t __rev0_384; __rev0_384 = __builtin_shufflevector(__s0_384, __s0_384, 1, 0); \ - uint64x1_t __ret_384; \ - __ret_384 = __noswap_splat_laneq_u64(__rev0_384, __p1_384); \ - __ret_384; \ +#define vdup_laneq_u64(__p0_476, __p1_476) __extension__ ({ \ + uint64x2_t __s0_476 = __p0_476; \ + uint64x2_t __rev0_476; __rev0_476 = __builtin_shufflevector(__s0_476, __s0_476, 1, 0); \ + uint64x1_t __ret_476; \ + __ret_476 = __noswap_splat_laneq_u64(__rev0_476, __p1_476); \ + __ret_476; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdup_laneq_u16(__p0_385, __p1_385) __extension__ ({ \ - uint16x8_t __s0_385 = __p0_385; \ - uint16x4_t __ret_385; \ - __ret_385 = splat_laneq_u16(__s0_385, __p1_385); \ - __ret_385; \ +#define vdup_laneq_u16(__p0_477, __p1_477) __extension__ ({ \ + uint16x8_t __s0_477 = __p0_477; \ + uint16x4_t __ret_477; \ + __ret_477 = splat_laneq_u16(__s0_477, __p1_477); \ + __ret_477; \ }) #else -#define vdup_laneq_u16(__p0_386, __p1_386) __extension__ ({ \ - uint16x8_t __s0_386 = __p0_386; \ - uint16x8_t __rev0_386; __rev0_386 = __builtin_shufflevector(__s0_386, __s0_386, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x4_t __ret_386; \ - __ret_386 = __noswap_splat_laneq_u16(__rev0_386, __p1_386); \ - __ret_386 = __builtin_shufflevector(__ret_386, __ret_386, 3, 2, 1, 0); \ - __ret_386; \ +#define vdup_laneq_u16(__p0_478, __p1_478) __extension__ ({ \ + uint16x8_t __s0_478 = __p0_478; \ + uint16x8_t __rev0_478; __rev0_478 = __builtin_shufflevector(__s0_478, __s0_478, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x4_t __ret_478; \ + __ret_478 = __noswap_splat_laneq_u16(__rev0_478, __p1_478); \ + __ret_478 = __builtin_shufflevector(__ret_478, __ret_478, 3, 2, 1, 0); \ + __ret_478; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdup_laneq_s8(__p0_387, __p1_387) __extension__ ({ \ - int8x16_t __s0_387 = __p0_387; \ - int8x8_t __ret_387; \ - __ret_387 = splat_laneq_s8(__s0_387, __p1_387); \ - __ret_387; \ +#define vdup_laneq_s8(__p0_479, __p1_479) __extension__ ({ \ + int8x16_t __s0_479 = __p0_479; \ + int8x8_t __ret_479; \ + __ret_479 = splat_laneq_s8(__s0_479, __p1_479); \ + __ret_479; \ }) #else -#define vdup_laneq_s8(__p0_388, __p1_388) __extension__ ({ \ - int8x16_t __s0_388 = __p0_388; \ - int8x16_t __rev0_388; __rev0_388 = __builtin_shufflevector(__s0_388, __s0_388, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x8_t __ret_388; \ - __ret_388 = __noswap_splat_laneq_s8(__rev0_388, __p1_388); \ - __ret_388 = __builtin_shufflevector(__ret_388, __ret_388, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_388; \ +#define vdup_laneq_s8(__p0_480, __p1_480) __extension__ ({ \ + int8x16_t __s0_480 = __p0_480; \ + int8x16_t __rev0_480; __rev0_480 = __builtin_shufflevector(__s0_480, __s0_480, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x8_t __ret_480; \ + __ret_480 = __noswap_splat_laneq_s8(__rev0_480, __p1_480); \ + __ret_480 = __builtin_shufflevector(__ret_480, __ret_480, 7, 6, 5, 4, 3, 
2, 1, 0); \ + __ret_480; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdup_laneq_f64(__p0_389, __p1_389) __extension__ ({ \ - float64x2_t __s0_389 = __p0_389; \ - float64x1_t __ret_389; \ - __ret_389 = splat_laneq_f64(__s0_389, __p1_389); \ - __ret_389; \ +#define vdup_laneq_f64(__p0_481, __p1_481) __extension__ ({ \ + float64x2_t __s0_481 = __p0_481; \ + float64x1_t __ret_481; \ + __ret_481 = splat_laneq_f64(__s0_481, __p1_481); \ + __ret_481; \ }) #else -#define vdup_laneq_f64(__p0_390, __p1_390) __extension__ ({ \ - float64x2_t __s0_390 = __p0_390; \ - float64x2_t __rev0_390; __rev0_390 = __builtin_shufflevector(__s0_390, __s0_390, 1, 0); \ - float64x1_t __ret_390; \ - __ret_390 = __noswap_splat_laneq_f64(__rev0_390, __p1_390); \ - __ret_390; \ +#define vdup_laneq_f64(__p0_482, __p1_482) __extension__ ({ \ + float64x2_t __s0_482 = __p0_482; \ + float64x2_t __rev0_482; __rev0_482 = __builtin_shufflevector(__s0_482, __s0_482, 1, 0); \ + float64x1_t __ret_482; \ + __ret_482 = __noswap_splat_laneq_f64(__rev0_482, __p1_482); \ + __ret_482; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdup_laneq_f32(__p0_391, __p1_391) __extension__ ({ \ - float32x4_t __s0_391 = __p0_391; \ - float32x2_t __ret_391; \ - __ret_391 = splat_laneq_f32(__s0_391, __p1_391); \ - __ret_391; \ +#define vdup_laneq_f32(__p0_483, __p1_483) __extension__ ({ \ + float32x4_t __s0_483 = __p0_483; \ + float32x2_t __ret_483; \ + __ret_483 = splat_laneq_f32(__s0_483, __p1_483); \ + __ret_483; \ }) #else -#define vdup_laneq_f32(__p0_392, __p1_392) __extension__ ({ \ - float32x4_t __s0_392 = __p0_392; \ - float32x4_t __rev0_392; __rev0_392 = __builtin_shufflevector(__s0_392, __s0_392, 3, 2, 1, 0); \ - float32x2_t __ret_392; \ - __ret_392 = __noswap_splat_laneq_f32(__rev0_392, __p1_392); \ - __ret_392 = __builtin_shufflevector(__ret_392, __ret_392, 1, 0); \ - __ret_392; \ +#define vdup_laneq_f32(__p0_484, __p1_484) __extension__ ({ \ + float32x4_t __s0_484 = __p0_484; \ + float32x4_t __rev0_484; __rev0_484 = __builtin_shufflevector(__s0_484, __s0_484, 3, 2, 1, 0); \ + float32x2_t __ret_484; \ + __ret_484 = __noswap_splat_laneq_f32(__rev0_484, __p1_484); \ + __ret_484 = __builtin_shufflevector(__ret_484, __ret_484, 1, 0); \ + __ret_484; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdup_laneq_f16(__p0_393, __p1_393) __extension__ ({ \ - float16x8_t __s0_393 = __p0_393; \ - float16x4_t __ret_393; \ - __ret_393 = splat_laneq_f16(__s0_393, __p1_393); \ - __ret_393; \ +#define vdup_laneq_f16(__p0_485, __p1_485) __extension__ ({ \ + float16x8_t __s0_485 = __p0_485; \ + float16x4_t __ret_485; \ + __ret_485 = splat_laneq_f16(__s0_485, __p1_485); \ + __ret_485; \ }) #else -#define vdup_laneq_f16(__p0_394, __p1_394) __extension__ ({ \ - float16x8_t __s0_394 = __p0_394; \ - float16x8_t __rev0_394; __rev0_394 = __builtin_shufflevector(__s0_394, __s0_394, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x4_t __ret_394; \ - __ret_394 = __noswap_splat_laneq_f16(__rev0_394, __p1_394); \ - __ret_394 = __builtin_shufflevector(__ret_394, __ret_394, 3, 2, 1, 0); \ - __ret_394; \ +#define vdup_laneq_f16(__p0_486, __p1_486) __extension__ ({ \ + float16x8_t __s0_486 = __p0_486; \ + float16x8_t __rev0_486; __rev0_486 = __builtin_shufflevector(__s0_486, __s0_486, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __ret_486; \ + __ret_486 = __noswap_splat_laneq_f16(__rev0_486, __p1_486); \ + __ret_486 = __builtin_shufflevector(__ret_486, __ret_486, 3, 2, 1, 0); \ + __ret_486; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdup_laneq_s32(__p0_395, __p1_395) __extension__ ({ \ - 
int32x4_t __s0_395 = __p0_395; \ - int32x2_t __ret_395; \ - __ret_395 = splat_laneq_s32(__s0_395, __p1_395); \ - __ret_395; \ +#define vdup_laneq_s32(__p0_487, __p1_487) __extension__ ({ \ + int32x4_t __s0_487 = __p0_487; \ + int32x2_t __ret_487; \ + __ret_487 = splat_laneq_s32(__s0_487, __p1_487); \ + __ret_487; \ }) #else -#define vdup_laneq_s32(__p0_396, __p1_396) __extension__ ({ \ - int32x4_t __s0_396 = __p0_396; \ - int32x4_t __rev0_396; __rev0_396 = __builtin_shufflevector(__s0_396, __s0_396, 3, 2, 1, 0); \ - int32x2_t __ret_396; \ - __ret_396 = __noswap_splat_laneq_s32(__rev0_396, __p1_396); \ - __ret_396 = __builtin_shufflevector(__ret_396, __ret_396, 1, 0); \ - __ret_396; \ +#define vdup_laneq_s32(__p0_488, __p1_488) __extension__ ({ \ + int32x4_t __s0_488 = __p0_488; \ + int32x4_t __rev0_488; __rev0_488 = __builtin_shufflevector(__s0_488, __s0_488, 3, 2, 1, 0); \ + int32x2_t __ret_488; \ + __ret_488 = __noswap_splat_laneq_s32(__rev0_488, __p1_488); \ + __ret_488 = __builtin_shufflevector(__ret_488, __ret_488, 1, 0); \ + __ret_488; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdup_laneq_s64(__p0_397, __p1_397) __extension__ ({ \ - int64x2_t __s0_397 = __p0_397; \ - int64x1_t __ret_397; \ - __ret_397 = splat_laneq_s64(__s0_397, __p1_397); \ - __ret_397; \ +#define vdup_laneq_s64(__p0_489, __p1_489) __extension__ ({ \ + int64x2_t __s0_489 = __p0_489; \ + int64x1_t __ret_489; \ + __ret_489 = splat_laneq_s64(__s0_489, __p1_489); \ + __ret_489; \ }) #else -#define vdup_laneq_s64(__p0_398, __p1_398) __extension__ ({ \ - int64x2_t __s0_398 = __p0_398; \ - int64x2_t __rev0_398; __rev0_398 = __builtin_shufflevector(__s0_398, __s0_398, 1, 0); \ - int64x1_t __ret_398; \ - __ret_398 = __noswap_splat_laneq_s64(__rev0_398, __p1_398); \ - __ret_398; \ +#define vdup_laneq_s64(__p0_490, __p1_490) __extension__ ({ \ + int64x2_t __s0_490 = __p0_490; \ + int64x2_t __rev0_490; __rev0_490 = __builtin_shufflevector(__s0_490, __s0_490, 1, 0); \ + int64x1_t __ret_490; \ + __ret_490 = __noswap_splat_laneq_s64(__rev0_490, __p1_490); \ + __ret_490; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdup_laneq_s16(__p0_399, __p1_399) __extension__ ({ \ - int16x8_t __s0_399 = __p0_399; \ - int16x4_t __ret_399; \ - __ret_399 = splat_laneq_s16(__s0_399, __p1_399); \ - __ret_399; \ +#define vdup_laneq_s16(__p0_491, __p1_491) __extension__ ({ \ + int16x8_t __s0_491 = __p0_491; \ + int16x4_t __ret_491; \ + __ret_491 = splat_laneq_s16(__s0_491, __p1_491); \ + __ret_491; \ }) #else -#define vdup_laneq_s16(__p0_400, __p1_400) __extension__ ({ \ - int16x8_t __s0_400 = __p0_400; \ - int16x8_t __rev0_400; __rev0_400 = __builtin_shufflevector(__s0_400, __s0_400, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x4_t __ret_400; \ - __ret_400 = __noswap_splat_laneq_s16(__rev0_400, __p1_400); \ - __ret_400 = __builtin_shufflevector(__ret_400, __ret_400, 3, 2, 1, 0); \ - __ret_400; \ +#define vdup_laneq_s16(__p0_492, __p1_492) __extension__ ({ \ + int16x8_t __s0_492 = __p0_492; \ + int16x8_t __rev0_492; __rev0_492 = __builtin_shufflevector(__s0_492, __s0_492, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x4_t __ret_492; \ + __ret_492 = __noswap_splat_laneq_s16(__rev0_492, __p1_492); \ + __ret_492 = __builtin_shufflevector(__ret_492, __ret_492, 3, 2, 1, 0); \ + __ret_492; \ }) #endif @@ -50287,246 +51991,246 @@ __ai float64x1_t vfms_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) __ret = vfma_f64(__p0, -__p1, __p2); return __ret; } -#define vfmsd_lane_f64(__p0_401, __p1_401, __p2_401, __p3_401) __extension__ ({ \ - float64_t __s0_401 = __p0_401; \ 
- float64_t __s1_401 = __p1_401; \ - float64x1_t __s2_401 = __p2_401; \ - float64_t __ret_401; \ - __ret_401 = vfmad_lane_f64(__s0_401, -__s1_401, __s2_401, __p3_401); \ - __ret_401; \ +#define vfmsd_lane_f64(__p0_493, __p1_493, __p2_493, __p3_493) __extension__ ({ \ + float64_t __s0_493 = __p0_493; \ + float64_t __s1_493 = __p1_493; \ + float64x1_t __s2_493 = __p2_493; \ + float64_t __ret_493; \ + __ret_493 = vfmad_lane_f64(__s0_493, -__s1_493, __s2_493, __p3_493); \ + __ret_493; \ }) #ifdef __LITTLE_ENDIAN__ -#define vfmss_lane_f32(__p0_402, __p1_402, __p2_402, __p3_402) __extension__ ({ \ - float32_t __s0_402 = __p0_402; \ - float32_t __s1_402 = __p1_402; \ - float32x2_t __s2_402 = __p2_402; \ - float32_t __ret_402; \ - __ret_402 = vfmas_lane_f32(__s0_402, -__s1_402, __s2_402, __p3_402); \ - __ret_402; \ +#define vfmss_lane_f32(__p0_494, __p1_494, __p2_494, __p3_494) __extension__ ({ \ + float32_t __s0_494 = __p0_494; \ + float32_t __s1_494 = __p1_494; \ + float32x2_t __s2_494 = __p2_494; \ + float32_t __ret_494; \ + __ret_494 = vfmas_lane_f32(__s0_494, -__s1_494, __s2_494, __p3_494); \ + __ret_494; \ }) #else -#define vfmss_lane_f32(__p0_403, __p1_403, __p2_403, __p3_403) __extension__ ({ \ - float32_t __s0_403 = __p0_403; \ - float32_t __s1_403 = __p1_403; \ - float32x2_t __s2_403 = __p2_403; \ - float32x2_t __rev2_403; __rev2_403 = __builtin_shufflevector(__s2_403, __s2_403, 1, 0); \ - float32_t __ret_403; \ - __ret_403 = __noswap_vfmas_lane_f32(__s0_403, -__s1_403, __rev2_403, __p3_403); \ - __ret_403; \ +#define vfmss_lane_f32(__p0_495, __p1_495, __p2_495, __p3_495) __extension__ ({ \ + float32_t __s0_495 = __p0_495; \ + float32_t __s1_495 = __p1_495; \ + float32x2_t __s2_495 = __p2_495; \ + float32x2_t __rev2_495; __rev2_495 = __builtin_shufflevector(__s2_495, __s2_495, 1, 0); \ + float32_t __ret_495; \ + __ret_495 = __noswap_vfmas_lane_f32(__s0_495, -__s1_495, __rev2_495, __p3_495); \ + __ret_495; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmsq_lane_f64(__p0_404, __p1_404, __p2_404, __p3_404) __extension__ ({ \ - float64x2_t __s0_404 = __p0_404; \ - float64x2_t __s1_404 = __p1_404; \ - float64x1_t __s2_404 = __p2_404; \ - float64x2_t __ret_404; \ - __ret_404 = vfmaq_lane_f64(__s0_404, -__s1_404, __s2_404, __p3_404); \ - __ret_404; \ +#define vfmsq_lane_f64(__p0_496, __p1_496, __p2_496, __p3_496) __extension__ ({ \ + float64x2_t __s0_496 = __p0_496; \ + float64x2_t __s1_496 = __p1_496; \ + float64x1_t __s2_496 = __p2_496; \ + float64x2_t __ret_496; \ + __ret_496 = vfmaq_lane_f64(__s0_496, -__s1_496, __s2_496, __p3_496); \ + __ret_496; \ }) #else -#define vfmsq_lane_f64(__p0_405, __p1_405, __p2_405, __p3_405) __extension__ ({ \ - float64x2_t __s0_405 = __p0_405; \ - float64x2_t __s1_405 = __p1_405; \ - float64x1_t __s2_405 = __p2_405; \ - float64x2_t __rev0_405; __rev0_405 = __builtin_shufflevector(__s0_405, __s0_405, 1, 0); \ - float64x2_t __rev1_405; __rev1_405 = __builtin_shufflevector(__s1_405, __s1_405, 1, 0); \ - float64x2_t __ret_405; \ - __ret_405 = __noswap_vfmaq_lane_f64(__rev0_405, -__rev1_405, __s2_405, __p3_405); \ - __ret_405 = __builtin_shufflevector(__ret_405, __ret_405, 1, 0); \ - __ret_405; \ +#define vfmsq_lane_f64(__p0_497, __p1_497, __p2_497, __p3_497) __extension__ ({ \ + float64x2_t __s0_497 = __p0_497; \ + float64x2_t __s1_497 = __p1_497; \ + float64x1_t __s2_497 = __p2_497; \ + float64x2_t __rev0_497; __rev0_497 = __builtin_shufflevector(__s0_497, __s0_497, 1, 0); \ + float64x2_t __rev1_497; __rev1_497 = __builtin_shufflevector(__s1_497, __s1_497, 1, 
0); \ + float64x2_t __ret_497; \ + __ret_497 = __noswap_vfmaq_lane_f64(__rev0_497, -__rev1_497, __s2_497, __p3_497); \ + __ret_497 = __builtin_shufflevector(__ret_497, __ret_497, 1, 0); \ + __ret_497; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmsq_lane_f32(__p0_406, __p1_406, __p2_406, __p3_406) __extension__ ({ \ - float32x4_t __s0_406 = __p0_406; \ - float32x4_t __s1_406 = __p1_406; \ - float32x2_t __s2_406 = __p2_406; \ - float32x4_t __ret_406; \ - __ret_406 = vfmaq_lane_f32(__s0_406, -__s1_406, __s2_406, __p3_406); \ - __ret_406; \ +#define vfmsq_lane_f32(__p0_498, __p1_498, __p2_498, __p3_498) __extension__ ({ \ + float32x4_t __s0_498 = __p0_498; \ + float32x4_t __s1_498 = __p1_498; \ + float32x2_t __s2_498 = __p2_498; \ + float32x4_t __ret_498; \ + __ret_498 = vfmaq_lane_f32(__s0_498, -__s1_498, __s2_498, __p3_498); \ + __ret_498; \ }) #else -#define vfmsq_lane_f32(__p0_407, __p1_407, __p2_407, __p3_407) __extension__ ({ \ - float32x4_t __s0_407 = __p0_407; \ - float32x4_t __s1_407 = __p1_407; \ - float32x2_t __s2_407 = __p2_407; \ - float32x4_t __rev0_407; __rev0_407 = __builtin_shufflevector(__s0_407, __s0_407, 3, 2, 1, 0); \ - float32x4_t __rev1_407; __rev1_407 = __builtin_shufflevector(__s1_407, __s1_407, 3, 2, 1, 0); \ - float32x2_t __rev2_407; __rev2_407 = __builtin_shufflevector(__s2_407, __s2_407, 1, 0); \ - float32x4_t __ret_407; \ - __ret_407 = __noswap_vfmaq_lane_f32(__rev0_407, -__rev1_407, __rev2_407, __p3_407); \ - __ret_407 = __builtin_shufflevector(__ret_407, __ret_407, 3, 2, 1, 0); \ - __ret_407; \ +#define vfmsq_lane_f32(__p0_499, __p1_499, __p2_499, __p3_499) __extension__ ({ \ + float32x4_t __s0_499 = __p0_499; \ + float32x4_t __s1_499 = __p1_499; \ + float32x2_t __s2_499 = __p2_499; \ + float32x4_t __rev0_499; __rev0_499 = __builtin_shufflevector(__s0_499, __s0_499, 3, 2, 1, 0); \ + float32x4_t __rev1_499; __rev1_499 = __builtin_shufflevector(__s1_499, __s1_499, 3, 2, 1, 0); \ + float32x2_t __rev2_499; __rev2_499 = __builtin_shufflevector(__s2_499, __s2_499, 1, 0); \ + float32x4_t __ret_499; \ + __ret_499 = __noswap_vfmaq_lane_f32(__rev0_499, -__rev1_499, __rev2_499, __p3_499); \ + __ret_499 = __builtin_shufflevector(__ret_499, __ret_499, 3, 2, 1, 0); \ + __ret_499; \ }) #endif -#define vfms_lane_f64(__p0_408, __p1_408, __p2_408, __p3_408) __extension__ ({ \ - float64x1_t __s0_408 = __p0_408; \ - float64x1_t __s1_408 = __p1_408; \ - float64x1_t __s2_408 = __p2_408; \ - float64x1_t __ret_408; \ - __ret_408 = vfma_lane_f64(__s0_408, -__s1_408, __s2_408, __p3_408); \ - __ret_408; \ +#define vfms_lane_f64(__p0_500, __p1_500, __p2_500, __p3_500) __extension__ ({ \ + float64x1_t __s0_500 = __p0_500; \ + float64x1_t __s1_500 = __p1_500; \ + float64x1_t __s2_500 = __p2_500; \ + float64x1_t __ret_500; \ + __ret_500 = vfma_lane_f64(__s0_500, -__s1_500, __s2_500, __p3_500); \ + __ret_500; \ }) #ifdef __LITTLE_ENDIAN__ -#define vfms_lane_f32(__p0_409, __p1_409, __p2_409, __p3_409) __extension__ ({ \ - float32x2_t __s0_409 = __p0_409; \ - float32x2_t __s1_409 = __p1_409; \ - float32x2_t __s2_409 = __p2_409; \ - float32x2_t __ret_409; \ - __ret_409 = vfma_lane_f32(__s0_409, -__s1_409, __s2_409, __p3_409); \ - __ret_409; \ +#define vfms_lane_f32(__p0_501, __p1_501, __p2_501, __p3_501) __extension__ ({ \ + float32x2_t __s0_501 = __p0_501; \ + float32x2_t __s1_501 = __p1_501; \ + float32x2_t __s2_501 = __p2_501; \ + float32x2_t __ret_501; \ + __ret_501 = vfma_lane_f32(__s0_501, -__s1_501, __s2_501, __p3_501); \ + __ret_501; \ }) #else -#define vfms_lane_f32(__p0_410, 
__p1_410, __p2_410, __p3_410) __extension__ ({ \ - float32x2_t __s0_410 = __p0_410; \ - float32x2_t __s1_410 = __p1_410; \ - float32x2_t __s2_410 = __p2_410; \ - float32x2_t __rev0_410; __rev0_410 = __builtin_shufflevector(__s0_410, __s0_410, 1, 0); \ - float32x2_t __rev1_410; __rev1_410 = __builtin_shufflevector(__s1_410, __s1_410, 1, 0); \ - float32x2_t __rev2_410; __rev2_410 = __builtin_shufflevector(__s2_410, __s2_410, 1, 0); \ - float32x2_t __ret_410; \ - __ret_410 = __noswap_vfma_lane_f32(__rev0_410, -__rev1_410, __rev2_410, __p3_410); \ - __ret_410 = __builtin_shufflevector(__ret_410, __ret_410, 1, 0); \ - __ret_410; \ +#define vfms_lane_f32(__p0_502, __p1_502, __p2_502, __p3_502) __extension__ ({ \ + float32x2_t __s0_502 = __p0_502; \ + float32x2_t __s1_502 = __p1_502; \ + float32x2_t __s2_502 = __p2_502; \ + float32x2_t __rev0_502; __rev0_502 = __builtin_shufflevector(__s0_502, __s0_502, 1, 0); \ + float32x2_t __rev1_502; __rev1_502 = __builtin_shufflevector(__s1_502, __s1_502, 1, 0); \ + float32x2_t __rev2_502; __rev2_502 = __builtin_shufflevector(__s2_502, __s2_502, 1, 0); \ + float32x2_t __ret_502; \ + __ret_502 = __noswap_vfma_lane_f32(__rev0_502, -__rev1_502, __rev2_502, __p3_502); \ + __ret_502 = __builtin_shufflevector(__ret_502, __ret_502, 1, 0); \ + __ret_502; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmsd_laneq_f64(__p0_411, __p1_411, __p2_411, __p3_411) __extension__ ({ \ - float64_t __s0_411 = __p0_411; \ - float64_t __s1_411 = __p1_411; \ - float64x2_t __s2_411 = __p2_411; \ - float64_t __ret_411; \ - __ret_411 = vfmad_laneq_f64(__s0_411, -__s1_411, __s2_411, __p3_411); \ - __ret_411; \ +#define vfmsd_laneq_f64(__p0_503, __p1_503, __p2_503, __p3_503) __extension__ ({ \ + float64_t __s0_503 = __p0_503; \ + float64_t __s1_503 = __p1_503; \ + float64x2_t __s2_503 = __p2_503; \ + float64_t __ret_503; \ + __ret_503 = vfmad_laneq_f64(__s0_503, -__s1_503, __s2_503, __p3_503); \ + __ret_503; \ }) #else -#define vfmsd_laneq_f64(__p0_412, __p1_412, __p2_412, __p3_412) __extension__ ({ \ - float64_t __s0_412 = __p0_412; \ - float64_t __s1_412 = __p1_412; \ - float64x2_t __s2_412 = __p2_412; \ - float64x2_t __rev2_412; __rev2_412 = __builtin_shufflevector(__s2_412, __s2_412, 1, 0); \ - float64_t __ret_412; \ - __ret_412 = __noswap_vfmad_laneq_f64(__s0_412, -__s1_412, __rev2_412, __p3_412); \ - __ret_412; \ +#define vfmsd_laneq_f64(__p0_504, __p1_504, __p2_504, __p3_504) __extension__ ({ \ + float64_t __s0_504 = __p0_504; \ + float64_t __s1_504 = __p1_504; \ + float64x2_t __s2_504 = __p2_504; \ + float64x2_t __rev2_504; __rev2_504 = __builtin_shufflevector(__s2_504, __s2_504, 1, 0); \ + float64_t __ret_504; \ + __ret_504 = __noswap_vfmad_laneq_f64(__s0_504, -__s1_504, __rev2_504, __p3_504); \ + __ret_504; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmss_laneq_f32(__p0_413, __p1_413, __p2_413, __p3_413) __extension__ ({ \ - float32_t __s0_413 = __p0_413; \ - float32_t __s1_413 = __p1_413; \ - float32x4_t __s2_413 = __p2_413; \ - float32_t __ret_413; \ - __ret_413 = vfmas_laneq_f32(__s0_413, -__s1_413, __s2_413, __p3_413); \ - __ret_413; \ +#define vfmss_laneq_f32(__p0_505, __p1_505, __p2_505, __p3_505) __extension__ ({ \ + float32_t __s0_505 = __p0_505; \ + float32_t __s1_505 = __p1_505; \ + float32x4_t __s2_505 = __p2_505; \ + float32_t __ret_505; \ + __ret_505 = vfmas_laneq_f32(__s0_505, -__s1_505, __s2_505, __p3_505); \ + __ret_505; \ }) #else -#define vfmss_laneq_f32(__p0_414, __p1_414, __p2_414, __p3_414) __extension__ ({ \ - float32_t __s0_414 = __p0_414; \ - 
float32_t __s1_414 = __p1_414; \ - float32x4_t __s2_414 = __p2_414; \ - float32x4_t __rev2_414; __rev2_414 = __builtin_shufflevector(__s2_414, __s2_414, 3, 2, 1, 0); \ - float32_t __ret_414; \ - __ret_414 = __noswap_vfmas_laneq_f32(__s0_414, -__s1_414, __rev2_414, __p3_414); \ - __ret_414; \ +#define vfmss_laneq_f32(__p0_506, __p1_506, __p2_506, __p3_506) __extension__ ({ \ + float32_t __s0_506 = __p0_506; \ + float32_t __s1_506 = __p1_506; \ + float32x4_t __s2_506 = __p2_506; \ + float32x4_t __rev2_506; __rev2_506 = __builtin_shufflevector(__s2_506, __s2_506, 3, 2, 1, 0); \ + float32_t __ret_506; \ + __ret_506 = __noswap_vfmas_laneq_f32(__s0_506, -__s1_506, __rev2_506, __p3_506); \ + __ret_506; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmsq_laneq_f64(__p0_415, __p1_415, __p2_415, __p3_415) __extension__ ({ \ - float64x2_t __s0_415 = __p0_415; \ - float64x2_t __s1_415 = __p1_415; \ - float64x2_t __s2_415 = __p2_415; \ - float64x2_t __ret_415; \ - __ret_415 = vfmaq_laneq_f64(__s0_415, -__s1_415, __s2_415, __p3_415); \ - __ret_415; \ +#define vfmsq_laneq_f64(__p0_507, __p1_507, __p2_507, __p3_507) __extension__ ({ \ + float64x2_t __s0_507 = __p0_507; \ + float64x2_t __s1_507 = __p1_507; \ + float64x2_t __s2_507 = __p2_507; \ + float64x2_t __ret_507; \ + __ret_507 = vfmaq_laneq_f64(__s0_507, -__s1_507, __s2_507, __p3_507); \ + __ret_507; \ }) #else -#define vfmsq_laneq_f64(__p0_416, __p1_416, __p2_416, __p3_416) __extension__ ({ \ - float64x2_t __s0_416 = __p0_416; \ - float64x2_t __s1_416 = __p1_416; \ - float64x2_t __s2_416 = __p2_416; \ - float64x2_t __rev0_416; __rev0_416 = __builtin_shufflevector(__s0_416, __s0_416, 1, 0); \ - float64x2_t __rev1_416; __rev1_416 = __builtin_shufflevector(__s1_416, __s1_416, 1, 0); \ - float64x2_t __rev2_416; __rev2_416 = __builtin_shufflevector(__s2_416, __s2_416, 1, 0); \ - float64x2_t __ret_416; \ - __ret_416 = __noswap_vfmaq_laneq_f64(__rev0_416, -__rev1_416, __rev2_416, __p3_416); \ - __ret_416 = __builtin_shufflevector(__ret_416, __ret_416, 1, 0); \ - __ret_416; \ +#define vfmsq_laneq_f64(__p0_508, __p1_508, __p2_508, __p3_508) __extension__ ({ \ + float64x2_t __s0_508 = __p0_508; \ + float64x2_t __s1_508 = __p1_508; \ + float64x2_t __s2_508 = __p2_508; \ + float64x2_t __rev0_508; __rev0_508 = __builtin_shufflevector(__s0_508, __s0_508, 1, 0); \ + float64x2_t __rev1_508; __rev1_508 = __builtin_shufflevector(__s1_508, __s1_508, 1, 0); \ + float64x2_t __rev2_508; __rev2_508 = __builtin_shufflevector(__s2_508, __s2_508, 1, 0); \ + float64x2_t __ret_508; \ + __ret_508 = __noswap_vfmaq_laneq_f64(__rev0_508, -__rev1_508, __rev2_508, __p3_508); \ + __ret_508 = __builtin_shufflevector(__ret_508, __ret_508, 1, 0); \ + __ret_508; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmsq_laneq_f32(__p0_417, __p1_417, __p2_417, __p3_417) __extension__ ({ \ - float32x4_t __s0_417 = __p0_417; \ - float32x4_t __s1_417 = __p1_417; \ - float32x4_t __s2_417 = __p2_417; \ - float32x4_t __ret_417; \ - __ret_417 = vfmaq_laneq_f32(__s0_417, -__s1_417, __s2_417, __p3_417); \ - __ret_417; \ +#define vfmsq_laneq_f32(__p0_509, __p1_509, __p2_509, __p3_509) __extension__ ({ \ + float32x4_t __s0_509 = __p0_509; \ + float32x4_t __s1_509 = __p1_509; \ + float32x4_t __s2_509 = __p2_509; \ + float32x4_t __ret_509; \ + __ret_509 = vfmaq_laneq_f32(__s0_509, -__s1_509, __s2_509, __p3_509); \ + __ret_509; \ }) #else -#define vfmsq_laneq_f32(__p0_418, __p1_418, __p2_418, __p3_418) __extension__ ({ \ - float32x4_t __s0_418 = __p0_418; \ - float32x4_t __s1_418 = __p1_418; \ - float32x4_t 
__s2_418 = __p2_418; \ - float32x4_t __rev0_418; __rev0_418 = __builtin_shufflevector(__s0_418, __s0_418, 3, 2, 1, 0); \ - float32x4_t __rev1_418; __rev1_418 = __builtin_shufflevector(__s1_418, __s1_418, 3, 2, 1, 0); \ - float32x4_t __rev2_418; __rev2_418 = __builtin_shufflevector(__s2_418, __s2_418, 3, 2, 1, 0); \ - float32x4_t __ret_418; \ - __ret_418 = __noswap_vfmaq_laneq_f32(__rev0_418, -__rev1_418, __rev2_418, __p3_418); \ - __ret_418 = __builtin_shufflevector(__ret_418, __ret_418, 3, 2, 1, 0); \ - __ret_418; \ +#define vfmsq_laneq_f32(__p0_510, __p1_510, __p2_510, __p3_510) __extension__ ({ \ + float32x4_t __s0_510 = __p0_510; \ + float32x4_t __s1_510 = __p1_510; \ + float32x4_t __s2_510 = __p2_510; \ + float32x4_t __rev0_510; __rev0_510 = __builtin_shufflevector(__s0_510, __s0_510, 3, 2, 1, 0); \ + float32x4_t __rev1_510; __rev1_510 = __builtin_shufflevector(__s1_510, __s1_510, 3, 2, 1, 0); \ + float32x4_t __rev2_510; __rev2_510 = __builtin_shufflevector(__s2_510, __s2_510, 3, 2, 1, 0); \ + float32x4_t __ret_510; \ + __ret_510 = __noswap_vfmaq_laneq_f32(__rev0_510, -__rev1_510, __rev2_510, __p3_510); \ + __ret_510 = __builtin_shufflevector(__ret_510, __ret_510, 3, 2, 1, 0); \ + __ret_510; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfms_laneq_f64(__p0_419, __p1_419, __p2_419, __p3_419) __extension__ ({ \ - float64x1_t __s0_419 = __p0_419; \ - float64x1_t __s1_419 = __p1_419; \ - float64x2_t __s2_419 = __p2_419; \ - float64x1_t __ret_419; \ - __ret_419 = vfma_laneq_f64(__s0_419, -__s1_419, __s2_419, __p3_419); \ - __ret_419; \ +#define vfms_laneq_f64(__p0_511, __p1_511, __p2_511, __p3_511) __extension__ ({ \ + float64x1_t __s0_511 = __p0_511; \ + float64x1_t __s1_511 = __p1_511; \ + float64x2_t __s2_511 = __p2_511; \ + float64x1_t __ret_511; \ + __ret_511 = vfma_laneq_f64(__s0_511, -__s1_511, __s2_511, __p3_511); \ + __ret_511; \ }) #else -#define vfms_laneq_f64(__p0_420, __p1_420, __p2_420, __p3_420) __extension__ ({ \ - float64x1_t __s0_420 = __p0_420; \ - float64x1_t __s1_420 = __p1_420; \ - float64x2_t __s2_420 = __p2_420; \ - float64x2_t __rev2_420; __rev2_420 = __builtin_shufflevector(__s2_420, __s2_420, 1, 0); \ - float64x1_t __ret_420; \ - __ret_420 = __noswap_vfma_laneq_f64(__s0_420, -__s1_420, __rev2_420, __p3_420); \ - __ret_420; \ +#define vfms_laneq_f64(__p0_512, __p1_512, __p2_512, __p3_512) __extension__ ({ \ + float64x1_t __s0_512 = __p0_512; \ + float64x1_t __s1_512 = __p1_512; \ + float64x2_t __s2_512 = __p2_512; \ + float64x2_t __rev2_512; __rev2_512 = __builtin_shufflevector(__s2_512, __s2_512, 1, 0); \ + float64x1_t __ret_512; \ + __ret_512 = __noswap_vfma_laneq_f64(__s0_512, -__s1_512, __rev2_512, __p3_512); \ + __ret_512; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfms_laneq_f32(__p0_421, __p1_421, __p2_421, __p3_421) __extension__ ({ \ - float32x2_t __s0_421 = __p0_421; \ - float32x2_t __s1_421 = __p1_421; \ - float32x4_t __s2_421 = __p2_421; \ - float32x2_t __ret_421; \ - __ret_421 = vfma_laneq_f32(__s0_421, -__s1_421, __s2_421, __p3_421); \ - __ret_421; \ +#define vfms_laneq_f32(__p0_513, __p1_513, __p2_513, __p3_513) __extension__ ({ \ + float32x2_t __s0_513 = __p0_513; \ + float32x2_t __s1_513 = __p1_513; \ + float32x4_t __s2_513 = __p2_513; \ + float32x2_t __ret_513; \ + __ret_513 = vfma_laneq_f32(__s0_513, -__s1_513, __s2_513, __p3_513); \ + __ret_513; \ }) #else -#define vfms_laneq_f32(__p0_422, __p1_422, __p2_422, __p3_422) __extension__ ({ \ - float32x2_t __s0_422 = __p0_422; \ - float32x2_t __s1_422 = __p1_422; \ - float32x4_t __s2_422 = 
__p2_422; \ - float32x2_t __rev0_422; __rev0_422 = __builtin_shufflevector(__s0_422, __s0_422, 1, 0); \ - float32x2_t __rev1_422; __rev1_422 = __builtin_shufflevector(__s1_422, __s1_422, 1, 0); \ - float32x4_t __rev2_422; __rev2_422 = __builtin_shufflevector(__s2_422, __s2_422, 3, 2, 1, 0); \ - float32x2_t __ret_422; \ - __ret_422 = __noswap_vfma_laneq_f32(__rev0_422, -__rev1_422, __rev2_422, __p3_422); \ - __ret_422 = __builtin_shufflevector(__ret_422, __ret_422, 1, 0); \ - __ret_422; \ +#define vfms_laneq_f32(__p0_514, __p1_514, __p2_514, __p3_514) __extension__ ({ \ + float32x2_t __s0_514 = __p0_514; \ + float32x2_t __s1_514 = __p1_514; \ + float32x4_t __s2_514 = __p2_514; \ + float32x2_t __rev0_514; __rev0_514 = __builtin_shufflevector(__s0_514, __s0_514, 1, 0); \ + float32x2_t __rev1_514; __rev1_514 = __builtin_shufflevector(__s1_514, __s1_514, 1, 0); \ + float32x4_t __rev2_514; __rev2_514 = __builtin_shufflevector(__s2_514, __s2_514, 3, 2, 1, 0); \ + float32x2_t __ret_514; \ + __ret_514 = __noswap_vfma_laneq_f32(__rev0_514, -__rev1_514, __rev2_514, __p3_514); \ + __ret_514 = __builtin_shufflevector(__ret_514, __ret_514, 1, 0); \ + __ret_514; \ }) #endif @@ -52548,547 +54252,530 @@ __ai float64x1_t vmla_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) return __ret; } #ifdef __LITTLE_ENDIAN__ -#define vmlaq_laneq_u32(__p0_423, __p1_423, __p2_423, __p3_423) __extension__ ({ \ - uint32x4_t __s0_423 = __p0_423; \ - uint32x4_t __s1_423 = __p1_423; \ - uint32x4_t __s2_423 = __p2_423; \ - uint32x4_t __ret_423; \ - __ret_423 = __s0_423 + __s1_423 * splatq_laneq_u32(__s2_423, __p3_423); \ - __ret_423; \ +#define vmlaq_laneq_u32(__p0_515, __p1_515, __p2_515, __p3_515) __extension__ ({ \ + uint32x4_t __s0_515 = __p0_515; \ + uint32x4_t __s1_515 = __p1_515; \ + uint32x4_t __s2_515 = __p2_515; \ + uint32x4_t __ret_515; \ + __ret_515 = __s0_515 + __s1_515 * splatq_laneq_u32(__s2_515, __p3_515); \ + __ret_515; \ }) #else -#define vmlaq_laneq_u32(__p0_424, __p1_424, __p2_424, __p3_424) __extension__ ({ \ - uint32x4_t __s0_424 = __p0_424; \ - uint32x4_t __s1_424 = __p1_424; \ - uint32x4_t __s2_424 = __p2_424; \ - uint32x4_t __rev0_424; __rev0_424 = __builtin_shufflevector(__s0_424, __s0_424, 3, 2, 1, 0); \ - uint32x4_t __rev1_424; __rev1_424 = __builtin_shufflevector(__s1_424, __s1_424, 3, 2, 1, 0); \ - uint32x4_t __rev2_424; __rev2_424 = __builtin_shufflevector(__s2_424, __s2_424, 3, 2, 1, 0); \ - uint32x4_t __ret_424; \ - __ret_424 = __rev0_424 + __rev1_424 * __noswap_splatq_laneq_u32(__rev2_424, __p3_424); \ - __ret_424 = __builtin_shufflevector(__ret_424, __ret_424, 3, 2, 1, 0); \ - __ret_424; \ +#define vmlaq_laneq_u32(__p0_516, __p1_516, __p2_516, __p3_516) __extension__ ({ \ + uint32x4_t __s0_516 = __p0_516; \ + uint32x4_t __s1_516 = __p1_516; \ + uint32x4_t __s2_516 = __p2_516; \ + uint32x4_t __rev0_516; __rev0_516 = __builtin_shufflevector(__s0_516, __s0_516, 3, 2, 1, 0); \ + uint32x4_t __rev1_516; __rev1_516 = __builtin_shufflevector(__s1_516, __s1_516, 3, 2, 1, 0); \ + uint32x4_t __rev2_516; __rev2_516 = __builtin_shufflevector(__s2_516, __s2_516, 3, 2, 1, 0); \ + uint32x4_t __ret_516; \ + __ret_516 = __rev0_516 + __rev1_516 * __noswap_splatq_laneq_u32(__rev2_516, __p3_516); \ + __ret_516 = __builtin_shufflevector(__ret_516, __ret_516, 3, 2, 1, 0); \ + __ret_516; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlaq_laneq_u16(__p0_425, __p1_425, __p2_425, __p3_425) __extension__ ({ \ - uint16x8_t __s0_425 = __p0_425; \ - uint16x8_t __s1_425 = __p1_425; \ - uint16x8_t __s2_425 = 
__p2_425; \ - uint16x8_t __ret_425; \ - __ret_425 = __s0_425 + __s1_425 * splatq_laneq_u16(__s2_425, __p3_425); \ - __ret_425; \ +#define vmlaq_laneq_u16(__p0_517, __p1_517, __p2_517, __p3_517) __extension__ ({ \ + uint16x8_t __s0_517 = __p0_517; \ + uint16x8_t __s1_517 = __p1_517; \ + uint16x8_t __s2_517 = __p2_517; \ + uint16x8_t __ret_517; \ + __ret_517 = __s0_517 + __s1_517 * splatq_laneq_u16(__s2_517, __p3_517); \ + __ret_517; \ }) #else -#define vmlaq_laneq_u16(__p0_426, __p1_426, __p2_426, __p3_426) __extension__ ({ \ - uint16x8_t __s0_426 = __p0_426; \ - uint16x8_t __s1_426 = __p1_426; \ - uint16x8_t __s2_426 = __p2_426; \ - uint16x8_t __rev0_426; __rev0_426 = __builtin_shufflevector(__s0_426, __s0_426, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __rev1_426; __rev1_426 = __builtin_shufflevector(__s1_426, __s1_426, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __rev2_426; __rev2_426 = __builtin_shufflevector(__s2_426, __s2_426, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __ret_426; \ - __ret_426 = __rev0_426 + __rev1_426 * __noswap_splatq_laneq_u16(__rev2_426, __p3_426); \ - __ret_426 = __builtin_shufflevector(__ret_426, __ret_426, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_426; \ +#define vmlaq_laneq_u16(__p0_518, __p1_518, __p2_518, __p3_518) __extension__ ({ \ + uint16x8_t __s0_518 = __p0_518; \ + uint16x8_t __s1_518 = __p1_518; \ + uint16x8_t __s2_518 = __p2_518; \ + uint16x8_t __rev0_518; __rev0_518 = __builtin_shufflevector(__s0_518, __s0_518, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __rev1_518; __rev1_518 = __builtin_shufflevector(__s1_518, __s1_518, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __rev2_518; __rev2_518 = __builtin_shufflevector(__s2_518, __s2_518, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __ret_518; \ + __ret_518 = __rev0_518 + __rev1_518 * __noswap_splatq_laneq_u16(__rev2_518, __p3_518); \ + __ret_518 = __builtin_shufflevector(__ret_518, __ret_518, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_518; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlaq_laneq_f32(__p0_427, __p1_427, __p2_427, __p3_427) __extension__ ({ \ - float32x4_t __s0_427 = __p0_427; \ - float32x4_t __s1_427 = __p1_427; \ - float32x4_t __s2_427 = __p2_427; \ - float32x4_t __ret_427; \ - __ret_427 = __s0_427 + __s1_427 * splatq_laneq_f32(__s2_427, __p3_427); \ - __ret_427; \ +#define vmlaq_laneq_f32(__p0_519, __p1_519, __p2_519, __p3_519) __extension__ ({ \ + float32x4_t __s0_519 = __p0_519; \ + float32x4_t __s1_519 = __p1_519; \ + float32x4_t __s2_519 = __p2_519; \ + float32x4_t __ret_519; \ + __ret_519 = __s0_519 + __s1_519 * splatq_laneq_f32(__s2_519, __p3_519); \ + __ret_519; \ }) #else -#define vmlaq_laneq_f32(__p0_428, __p1_428, __p2_428, __p3_428) __extension__ ({ \ - float32x4_t __s0_428 = __p0_428; \ - float32x4_t __s1_428 = __p1_428; \ - float32x4_t __s2_428 = __p2_428; \ - float32x4_t __rev0_428; __rev0_428 = __builtin_shufflevector(__s0_428, __s0_428, 3, 2, 1, 0); \ - float32x4_t __rev1_428; __rev1_428 = __builtin_shufflevector(__s1_428, __s1_428, 3, 2, 1, 0); \ - float32x4_t __rev2_428; __rev2_428 = __builtin_shufflevector(__s2_428, __s2_428, 3, 2, 1, 0); \ - float32x4_t __ret_428; \ - __ret_428 = __rev0_428 + __rev1_428 * __noswap_splatq_laneq_f32(__rev2_428, __p3_428); \ - __ret_428 = __builtin_shufflevector(__ret_428, __ret_428, 3, 2, 1, 0); \ - __ret_428; \ +#define vmlaq_laneq_f32(__p0_520, __p1_520, __p2_520, __p3_520) __extension__ ({ \ + float32x4_t __s0_520 = __p0_520; \ + float32x4_t __s1_520 = __p1_520; \ + float32x4_t __s2_520 = __p2_520; \ + float32x4_t __rev0_520; __rev0_520 = 
__builtin_shufflevector(__s0_520, __s0_520, 3, 2, 1, 0); \ + float32x4_t __rev1_520; __rev1_520 = __builtin_shufflevector(__s1_520, __s1_520, 3, 2, 1, 0); \ + float32x4_t __rev2_520; __rev2_520 = __builtin_shufflevector(__s2_520, __s2_520, 3, 2, 1, 0); \ + float32x4_t __ret_520; \ + __ret_520 = __rev0_520 + __rev1_520 * __noswap_splatq_laneq_f32(__rev2_520, __p3_520); \ + __ret_520 = __builtin_shufflevector(__ret_520, __ret_520, 3, 2, 1, 0); \ + __ret_520; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlaq_laneq_s32(__p0_429, __p1_429, __p2_429, __p3_429) __extension__ ({ \ - int32x4_t __s0_429 = __p0_429; \ - int32x4_t __s1_429 = __p1_429; \ - int32x4_t __s2_429 = __p2_429; \ - int32x4_t __ret_429; \ - __ret_429 = __s0_429 + __s1_429 * splatq_laneq_s32(__s2_429, __p3_429); \ - __ret_429; \ +#define vmlaq_laneq_s32(__p0_521, __p1_521, __p2_521, __p3_521) __extension__ ({ \ + int32x4_t __s0_521 = __p0_521; \ + int32x4_t __s1_521 = __p1_521; \ + int32x4_t __s2_521 = __p2_521; \ + int32x4_t __ret_521; \ + __ret_521 = __s0_521 + __s1_521 * splatq_laneq_s32(__s2_521, __p3_521); \ + __ret_521; \ }) #else -#define vmlaq_laneq_s32(__p0_430, __p1_430, __p2_430, __p3_430) __extension__ ({ \ - int32x4_t __s0_430 = __p0_430; \ - int32x4_t __s1_430 = __p1_430; \ - int32x4_t __s2_430 = __p2_430; \ - int32x4_t __rev0_430; __rev0_430 = __builtin_shufflevector(__s0_430, __s0_430, 3, 2, 1, 0); \ - int32x4_t __rev1_430; __rev1_430 = __builtin_shufflevector(__s1_430, __s1_430, 3, 2, 1, 0); \ - int32x4_t __rev2_430; __rev2_430 = __builtin_shufflevector(__s2_430, __s2_430, 3, 2, 1, 0); \ - int32x4_t __ret_430; \ - __ret_430 = __rev0_430 + __rev1_430 * __noswap_splatq_laneq_s32(__rev2_430, __p3_430); \ - __ret_430 = __builtin_shufflevector(__ret_430, __ret_430, 3, 2, 1, 0); \ - __ret_430; \ +#define vmlaq_laneq_s32(__p0_522, __p1_522, __p2_522, __p3_522) __extension__ ({ \ + int32x4_t __s0_522 = __p0_522; \ + int32x4_t __s1_522 = __p1_522; \ + int32x4_t __s2_522 = __p2_522; \ + int32x4_t __rev0_522; __rev0_522 = __builtin_shufflevector(__s0_522, __s0_522, 3, 2, 1, 0); \ + int32x4_t __rev1_522; __rev1_522 = __builtin_shufflevector(__s1_522, __s1_522, 3, 2, 1, 0); \ + int32x4_t __rev2_522; __rev2_522 = __builtin_shufflevector(__s2_522, __s2_522, 3, 2, 1, 0); \ + int32x4_t __ret_522; \ + __ret_522 = __rev0_522 + __rev1_522 * __noswap_splatq_laneq_s32(__rev2_522, __p3_522); \ + __ret_522 = __builtin_shufflevector(__ret_522, __ret_522, 3, 2, 1, 0); \ + __ret_522; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlaq_laneq_s16(__p0_431, __p1_431, __p2_431, __p3_431) __extension__ ({ \ - int16x8_t __s0_431 = __p0_431; \ - int16x8_t __s1_431 = __p1_431; \ - int16x8_t __s2_431 = __p2_431; \ - int16x8_t __ret_431; \ - __ret_431 = __s0_431 + __s1_431 * splatq_laneq_s16(__s2_431, __p3_431); \ - __ret_431; \ +#define vmlaq_laneq_s16(__p0_523, __p1_523, __p2_523, __p3_523) __extension__ ({ \ + int16x8_t __s0_523 = __p0_523; \ + int16x8_t __s1_523 = __p1_523; \ + int16x8_t __s2_523 = __p2_523; \ + int16x8_t __ret_523; \ + __ret_523 = __s0_523 + __s1_523 * splatq_laneq_s16(__s2_523, __p3_523); \ + __ret_523; \ }) #else -#define vmlaq_laneq_s16(__p0_432, __p1_432, __p2_432, __p3_432) __extension__ ({ \ - int16x8_t __s0_432 = __p0_432; \ - int16x8_t __s1_432 = __p1_432; \ - int16x8_t __s2_432 = __p2_432; \ - int16x8_t __rev0_432; __rev0_432 = __builtin_shufflevector(__s0_432, __s0_432, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev1_432; __rev1_432 = __builtin_shufflevector(__s1_432, __s1_432, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t 
__rev2_432; __rev2_432 = __builtin_shufflevector(__s2_432, __s2_432, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __ret_432; \ - __ret_432 = __rev0_432 + __rev1_432 * __noswap_splatq_laneq_s16(__rev2_432, __p3_432); \ - __ret_432 = __builtin_shufflevector(__ret_432, __ret_432, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_432; \ +#define vmlaq_laneq_s16(__p0_524, __p1_524, __p2_524, __p3_524) __extension__ ({ \ + int16x8_t __s0_524 = __p0_524; \ + int16x8_t __s1_524 = __p1_524; \ + int16x8_t __s2_524 = __p2_524; \ + int16x8_t __rev0_524; __rev0_524 = __builtin_shufflevector(__s0_524, __s0_524, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev1_524; __rev1_524 = __builtin_shufflevector(__s1_524, __s1_524, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev2_524; __rev2_524 = __builtin_shufflevector(__s2_524, __s2_524, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __ret_524; \ + __ret_524 = __rev0_524 + __rev1_524 * __noswap_splatq_laneq_s16(__rev2_524, __p3_524); \ + __ret_524 = __builtin_shufflevector(__ret_524, __ret_524, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_524; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmla_laneq_u32(__p0_433, __p1_433, __p2_433, __p3_433) __extension__ ({ \ - uint32x2_t __s0_433 = __p0_433; \ - uint32x2_t __s1_433 = __p1_433; \ - uint32x4_t __s2_433 = __p2_433; \ - uint32x2_t __ret_433; \ - __ret_433 = __s0_433 + __s1_433 * splat_laneq_u32(__s2_433, __p3_433); \ - __ret_433; \ +#define vmla_laneq_u32(__p0_525, __p1_525, __p2_525, __p3_525) __extension__ ({ \ + uint32x2_t __s0_525 = __p0_525; \ + uint32x2_t __s1_525 = __p1_525; \ + uint32x4_t __s2_525 = __p2_525; \ + uint32x2_t __ret_525; \ + __ret_525 = __s0_525 + __s1_525 * splat_laneq_u32(__s2_525, __p3_525); \ + __ret_525; \ }) #else -#define vmla_laneq_u32(__p0_434, __p1_434, __p2_434, __p3_434) __extension__ ({ \ - uint32x2_t __s0_434 = __p0_434; \ - uint32x2_t __s1_434 = __p1_434; \ - uint32x4_t __s2_434 = __p2_434; \ - uint32x2_t __rev0_434; __rev0_434 = __builtin_shufflevector(__s0_434, __s0_434, 1, 0); \ - uint32x2_t __rev1_434; __rev1_434 = __builtin_shufflevector(__s1_434, __s1_434, 1, 0); \ - uint32x4_t __rev2_434; __rev2_434 = __builtin_shufflevector(__s2_434, __s2_434, 3, 2, 1, 0); \ - uint32x2_t __ret_434; \ - __ret_434 = __rev0_434 + __rev1_434 * __noswap_splat_laneq_u32(__rev2_434, __p3_434); \ - __ret_434 = __builtin_shufflevector(__ret_434, __ret_434, 1, 0); \ - __ret_434; \ +#define vmla_laneq_u32(__p0_526, __p1_526, __p2_526, __p3_526) __extension__ ({ \ + uint32x2_t __s0_526 = __p0_526; \ + uint32x2_t __s1_526 = __p1_526; \ + uint32x4_t __s2_526 = __p2_526; \ + uint32x2_t __rev0_526; __rev0_526 = __builtin_shufflevector(__s0_526, __s0_526, 1, 0); \ + uint32x2_t __rev1_526; __rev1_526 = __builtin_shufflevector(__s1_526, __s1_526, 1, 0); \ + uint32x4_t __rev2_526; __rev2_526 = __builtin_shufflevector(__s2_526, __s2_526, 3, 2, 1, 0); \ + uint32x2_t __ret_526; \ + __ret_526 = __rev0_526 + __rev1_526 * __noswap_splat_laneq_u32(__rev2_526, __p3_526); \ + __ret_526 = __builtin_shufflevector(__ret_526, __ret_526, 1, 0); \ + __ret_526; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmla_laneq_u16(__p0_435, __p1_435, __p2_435, __p3_435) __extension__ ({ \ - uint16x4_t __s0_435 = __p0_435; \ - uint16x4_t __s1_435 = __p1_435; \ - uint16x8_t __s2_435 = __p2_435; \ - uint16x4_t __ret_435; \ - __ret_435 = __s0_435 + __s1_435 * splat_laneq_u16(__s2_435, __p3_435); \ - __ret_435; \ +#define vmla_laneq_u16(__p0_527, __p1_527, __p2_527, __p3_527) __extension__ ({ \ + uint16x4_t __s0_527 = __p0_527; \ + uint16x4_t __s1_527 = __p1_527; \ + uint16x8_t 
__s2_527 = __p2_527; \ + uint16x4_t __ret_527; \ + __ret_527 = __s0_527 + __s1_527 * splat_laneq_u16(__s2_527, __p3_527); \ + __ret_527; \ }) #else -#define vmla_laneq_u16(__p0_436, __p1_436, __p2_436, __p3_436) __extension__ ({ \ - uint16x4_t __s0_436 = __p0_436; \ - uint16x4_t __s1_436 = __p1_436; \ - uint16x8_t __s2_436 = __p2_436; \ - uint16x4_t __rev0_436; __rev0_436 = __builtin_shufflevector(__s0_436, __s0_436, 3, 2, 1, 0); \ - uint16x4_t __rev1_436; __rev1_436 = __builtin_shufflevector(__s1_436, __s1_436, 3, 2, 1, 0); \ - uint16x8_t __rev2_436; __rev2_436 = __builtin_shufflevector(__s2_436, __s2_436, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x4_t __ret_436; \ - __ret_436 = __rev0_436 + __rev1_436 * __noswap_splat_laneq_u16(__rev2_436, __p3_436); \ - __ret_436 = __builtin_shufflevector(__ret_436, __ret_436, 3, 2, 1, 0); \ - __ret_436; \ +#define vmla_laneq_u16(__p0_528, __p1_528, __p2_528, __p3_528) __extension__ ({ \ + uint16x4_t __s0_528 = __p0_528; \ + uint16x4_t __s1_528 = __p1_528; \ + uint16x8_t __s2_528 = __p2_528; \ + uint16x4_t __rev0_528; __rev0_528 = __builtin_shufflevector(__s0_528, __s0_528, 3, 2, 1, 0); \ + uint16x4_t __rev1_528; __rev1_528 = __builtin_shufflevector(__s1_528, __s1_528, 3, 2, 1, 0); \ + uint16x8_t __rev2_528; __rev2_528 = __builtin_shufflevector(__s2_528, __s2_528, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x4_t __ret_528; \ + __ret_528 = __rev0_528 + __rev1_528 * __noswap_splat_laneq_u16(__rev2_528, __p3_528); \ + __ret_528 = __builtin_shufflevector(__ret_528, __ret_528, 3, 2, 1, 0); \ + __ret_528; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmla_laneq_f32(__p0_437, __p1_437, __p2_437, __p3_437) __extension__ ({ \ - float32x2_t __s0_437 = __p0_437; \ - float32x2_t __s1_437 = __p1_437; \ - float32x4_t __s2_437 = __p2_437; \ - float32x2_t __ret_437; \ - __ret_437 = __s0_437 + __s1_437 * splat_laneq_f32(__s2_437, __p3_437); \ - __ret_437; \ +#define vmla_laneq_f32(__p0_529, __p1_529, __p2_529, __p3_529) __extension__ ({ \ + float32x2_t __s0_529 = __p0_529; \ + float32x2_t __s1_529 = __p1_529; \ + float32x4_t __s2_529 = __p2_529; \ + float32x2_t __ret_529; \ + __ret_529 = __s0_529 + __s1_529 * splat_laneq_f32(__s2_529, __p3_529); \ + __ret_529; \ }) #else -#define vmla_laneq_f32(__p0_438, __p1_438, __p2_438, __p3_438) __extension__ ({ \ - float32x2_t __s0_438 = __p0_438; \ - float32x2_t __s1_438 = __p1_438; \ - float32x4_t __s2_438 = __p2_438; \ - float32x2_t __rev0_438; __rev0_438 = __builtin_shufflevector(__s0_438, __s0_438, 1, 0); \ - float32x2_t __rev1_438; __rev1_438 = __builtin_shufflevector(__s1_438, __s1_438, 1, 0); \ - float32x4_t __rev2_438; __rev2_438 = __builtin_shufflevector(__s2_438, __s2_438, 3, 2, 1, 0); \ - float32x2_t __ret_438; \ - __ret_438 = __rev0_438 + __rev1_438 * __noswap_splat_laneq_f32(__rev2_438, __p3_438); \ - __ret_438 = __builtin_shufflevector(__ret_438, __ret_438, 1, 0); \ - __ret_438; \ +#define vmla_laneq_f32(__p0_530, __p1_530, __p2_530, __p3_530) __extension__ ({ \ + float32x2_t __s0_530 = __p0_530; \ + float32x2_t __s1_530 = __p1_530; \ + float32x4_t __s2_530 = __p2_530; \ + float32x2_t __rev0_530; __rev0_530 = __builtin_shufflevector(__s0_530, __s0_530, 1, 0); \ + float32x2_t __rev1_530; __rev1_530 = __builtin_shufflevector(__s1_530, __s1_530, 1, 0); \ + float32x4_t __rev2_530; __rev2_530 = __builtin_shufflevector(__s2_530, __s2_530, 3, 2, 1, 0); \ + float32x2_t __ret_530; \ + __ret_530 = __rev0_530 + __rev1_530 * __noswap_splat_laneq_f32(__rev2_530, __p3_530); \ + __ret_530 = __builtin_shufflevector(__ret_530, __ret_530, 1, 0); \ + 
__ret_530; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmla_laneq_s32(__p0_439, __p1_439, __p2_439, __p3_439) __extension__ ({ \ - int32x2_t __s0_439 = __p0_439; \ - int32x2_t __s1_439 = __p1_439; \ - int32x4_t __s2_439 = __p2_439; \ - int32x2_t __ret_439; \ - __ret_439 = __s0_439 + __s1_439 * splat_laneq_s32(__s2_439, __p3_439); \ - __ret_439; \ +#define vmla_laneq_s32(__p0_531, __p1_531, __p2_531, __p3_531) __extension__ ({ \ + int32x2_t __s0_531 = __p0_531; \ + int32x2_t __s1_531 = __p1_531; \ + int32x4_t __s2_531 = __p2_531; \ + int32x2_t __ret_531; \ + __ret_531 = __s0_531 + __s1_531 * splat_laneq_s32(__s2_531, __p3_531); \ + __ret_531; \ }) #else -#define vmla_laneq_s32(__p0_440, __p1_440, __p2_440, __p3_440) __extension__ ({ \ - int32x2_t __s0_440 = __p0_440; \ - int32x2_t __s1_440 = __p1_440; \ - int32x4_t __s2_440 = __p2_440; \ - int32x2_t __rev0_440; __rev0_440 = __builtin_shufflevector(__s0_440, __s0_440, 1, 0); \ - int32x2_t __rev1_440; __rev1_440 = __builtin_shufflevector(__s1_440, __s1_440, 1, 0); \ - int32x4_t __rev2_440; __rev2_440 = __builtin_shufflevector(__s2_440, __s2_440, 3, 2, 1, 0); \ - int32x2_t __ret_440; \ - __ret_440 = __rev0_440 + __rev1_440 * __noswap_splat_laneq_s32(__rev2_440, __p3_440); \ - __ret_440 = __builtin_shufflevector(__ret_440, __ret_440, 1, 0); \ - __ret_440; \ +#define vmla_laneq_s32(__p0_532, __p1_532, __p2_532, __p3_532) __extension__ ({ \ + int32x2_t __s0_532 = __p0_532; \ + int32x2_t __s1_532 = __p1_532; \ + int32x4_t __s2_532 = __p2_532; \ + int32x2_t __rev0_532; __rev0_532 = __builtin_shufflevector(__s0_532, __s0_532, 1, 0); \ + int32x2_t __rev1_532; __rev1_532 = __builtin_shufflevector(__s1_532, __s1_532, 1, 0); \ + int32x4_t __rev2_532; __rev2_532 = __builtin_shufflevector(__s2_532, __s2_532, 3, 2, 1, 0); \ + int32x2_t __ret_532; \ + __ret_532 = __rev0_532 + __rev1_532 * __noswap_splat_laneq_s32(__rev2_532, __p3_532); \ + __ret_532 = __builtin_shufflevector(__ret_532, __ret_532, 1, 0); \ + __ret_532; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmla_laneq_s16(__p0_441, __p1_441, __p2_441, __p3_441) __extension__ ({ \ - int16x4_t __s0_441 = __p0_441; \ - int16x4_t __s1_441 = __p1_441; \ - int16x8_t __s2_441 = __p2_441; \ - int16x4_t __ret_441; \ - __ret_441 = __s0_441 + __s1_441 * splat_laneq_s16(__s2_441, __p3_441); \ - __ret_441; \ +#define vmla_laneq_s16(__p0_533, __p1_533, __p2_533, __p3_533) __extension__ ({ \ + int16x4_t __s0_533 = __p0_533; \ + int16x4_t __s1_533 = __p1_533; \ + int16x8_t __s2_533 = __p2_533; \ + int16x4_t __ret_533; \ + __ret_533 = __s0_533 + __s1_533 * splat_laneq_s16(__s2_533, __p3_533); \ + __ret_533; \ }) #else -#define vmla_laneq_s16(__p0_442, __p1_442, __p2_442, __p3_442) __extension__ ({ \ - int16x4_t __s0_442 = __p0_442; \ - int16x4_t __s1_442 = __p1_442; \ - int16x8_t __s2_442 = __p2_442; \ - int16x4_t __rev0_442; __rev0_442 = __builtin_shufflevector(__s0_442, __s0_442, 3, 2, 1, 0); \ - int16x4_t __rev1_442; __rev1_442 = __builtin_shufflevector(__s1_442, __s1_442, 3, 2, 1, 0); \ - int16x8_t __rev2_442; __rev2_442 = __builtin_shufflevector(__s2_442, __s2_442, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x4_t __ret_442; \ - __ret_442 = __rev0_442 + __rev1_442 * __noswap_splat_laneq_s16(__rev2_442, __p3_442); \ - __ret_442 = __builtin_shufflevector(__ret_442, __ret_442, 3, 2, 1, 0); \ - __ret_442; \ +#define vmla_laneq_s16(__p0_534, __p1_534, __p2_534, __p3_534) __extension__ ({ \ + int16x4_t __s0_534 = __p0_534; \ + int16x4_t __s1_534 = __p1_534; \ + int16x8_t __s2_534 = __p2_534; \ + int16x4_t __rev0_534; __rev0_534 
= __builtin_shufflevector(__s0_534, __s0_534, 3, 2, 1, 0); \ + int16x4_t __rev1_534; __rev1_534 = __builtin_shufflevector(__s1_534, __s1_534, 3, 2, 1, 0); \ + int16x8_t __rev2_534; __rev2_534 = __builtin_shufflevector(__s2_534, __s2_534, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x4_t __ret_534; \ + __ret_534 = __rev0_534 + __rev1_534 * __noswap_splat_laneq_s16(__rev2_534, __p3_534); \ + __ret_534 = __builtin_shufflevector(__ret_534, __ret_534, 3, 2, 1, 0); \ + __ret_534; \ }) #endif #ifdef __LITTLE_ENDIAN__ -__ai float64x2_t vmlaq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) { - float64x2_t __ret; - __ret = __p0 + __p1 * (float64x2_t) {__p2, __p2}; - return __ret; -} -#else -__ai float64x2_t vmlaq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) { - float64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); - float64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); - float64x2_t __ret; - __ret = __rev0 + __rev1 * (float64x2_t) {__p2, __p2}; - __ret = __builtin_shufflevector(__ret, __ret, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vmlal_high_lane_u32(__p0_443, __p1_443, __p2_443, __p3_443) __extension__ ({ \ - uint64x2_t __s0_443 = __p0_443; \ - uint32x4_t __s1_443 = __p1_443; \ - uint32x2_t __s2_443 = __p2_443; \ - uint64x2_t __ret_443; \ - __ret_443 = __s0_443 + vmull_u32(vget_high_u32(__s1_443), splat_lane_u32(__s2_443, __p3_443)); \ - __ret_443; \ +#define vmlal_high_lane_u32(__p0_535, __p1_535, __p2_535, __p3_535) __extension__ ({ \ + uint64x2_t __s0_535 = __p0_535; \ + uint32x4_t __s1_535 = __p1_535; \ + uint32x2_t __s2_535 = __p2_535; \ + uint64x2_t __ret_535; \ + __ret_535 = __s0_535 + vmull_u32(vget_high_u32(__s1_535), splat_lane_u32(__s2_535, __p3_535)); \ + __ret_535; \ }) #else -#define vmlal_high_lane_u32(__p0_444, __p1_444, __p2_444, __p3_444) __extension__ ({ \ - uint64x2_t __s0_444 = __p0_444; \ - uint32x4_t __s1_444 = __p1_444; \ - uint32x2_t __s2_444 = __p2_444; \ - uint64x2_t __rev0_444; __rev0_444 = __builtin_shufflevector(__s0_444, __s0_444, 1, 0); \ - uint32x4_t __rev1_444; __rev1_444 = __builtin_shufflevector(__s1_444, __s1_444, 3, 2, 1, 0); \ - uint32x2_t __rev2_444; __rev2_444 = __builtin_shufflevector(__s2_444, __s2_444, 1, 0); \ - uint64x2_t __ret_444; \ - __ret_444 = __rev0_444 + __noswap_vmull_u32(__noswap_vget_high_u32(__rev1_444), __noswap_splat_lane_u32(__rev2_444, __p3_444)); \ - __ret_444 = __builtin_shufflevector(__ret_444, __ret_444, 1, 0); \ - __ret_444; \ +#define vmlal_high_lane_u32(__p0_536, __p1_536, __p2_536, __p3_536) __extension__ ({ \ + uint64x2_t __s0_536 = __p0_536; \ + uint32x4_t __s1_536 = __p1_536; \ + uint32x2_t __s2_536 = __p2_536; \ + uint64x2_t __rev0_536; __rev0_536 = __builtin_shufflevector(__s0_536, __s0_536, 1, 0); \ + uint32x4_t __rev1_536; __rev1_536 = __builtin_shufflevector(__s1_536, __s1_536, 3, 2, 1, 0); \ + uint32x2_t __rev2_536; __rev2_536 = __builtin_shufflevector(__s2_536, __s2_536, 1, 0); \ + uint64x2_t __ret_536; \ + __ret_536 = __rev0_536 + __noswap_vmull_u32(__noswap_vget_high_u32(__rev1_536), __noswap_splat_lane_u32(__rev2_536, __p3_536)); \ + __ret_536 = __builtin_shufflevector(__ret_536, __ret_536, 1, 0); \ + __ret_536; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlal_high_lane_u16(__p0_445, __p1_445, __p2_445, __p3_445) __extension__ ({ \ - uint32x4_t __s0_445 = __p0_445; \ - uint16x8_t __s1_445 = __p1_445; \ - uint16x4_t __s2_445 = __p2_445; \ - uint32x4_t __ret_445; \ - __ret_445 = __s0_445 + vmull_u16(vget_high_u16(__s1_445), 
splat_lane_u16(__s2_445, __p3_445)); \ - __ret_445; \ +#define vmlal_high_lane_u16(__p0_537, __p1_537, __p2_537, __p3_537) __extension__ ({ \ + uint32x4_t __s0_537 = __p0_537; \ + uint16x8_t __s1_537 = __p1_537; \ + uint16x4_t __s2_537 = __p2_537; \ + uint32x4_t __ret_537; \ + __ret_537 = __s0_537 + vmull_u16(vget_high_u16(__s1_537), splat_lane_u16(__s2_537, __p3_537)); \ + __ret_537; \ }) #else -#define vmlal_high_lane_u16(__p0_446, __p1_446, __p2_446, __p3_446) __extension__ ({ \ - uint32x4_t __s0_446 = __p0_446; \ - uint16x8_t __s1_446 = __p1_446; \ - uint16x4_t __s2_446 = __p2_446; \ - uint32x4_t __rev0_446; __rev0_446 = __builtin_shufflevector(__s0_446, __s0_446, 3, 2, 1, 0); \ - uint16x8_t __rev1_446; __rev1_446 = __builtin_shufflevector(__s1_446, __s1_446, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x4_t __rev2_446; __rev2_446 = __builtin_shufflevector(__s2_446, __s2_446, 3, 2, 1, 0); \ - uint32x4_t __ret_446; \ - __ret_446 = __rev0_446 + __noswap_vmull_u16(__noswap_vget_high_u16(__rev1_446), __noswap_splat_lane_u16(__rev2_446, __p3_446)); \ - __ret_446 = __builtin_shufflevector(__ret_446, __ret_446, 3, 2, 1, 0); \ - __ret_446; \ +#define vmlal_high_lane_u16(__p0_538, __p1_538, __p2_538, __p3_538) __extension__ ({ \ + uint32x4_t __s0_538 = __p0_538; \ + uint16x8_t __s1_538 = __p1_538; \ + uint16x4_t __s2_538 = __p2_538; \ + uint32x4_t __rev0_538; __rev0_538 = __builtin_shufflevector(__s0_538, __s0_538, 3, 2, 1, 0); \ + uint16x8_t __rev1_538; __rev1_538 = __builtin_shufflevector(__s1_538, __s1_538, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x4_t __rev2_538; __rev2_538 = __builtin_shufflevector(__s2_538, __s2_538, 3, 2, 1, 0); \ + uint32x4_t __ret_538; \ + __ret_538 = __rev0_538 + __noswap_vmull_u16(__noswap_vget_high_u16(__rev1_538), __noswap_splat_lane_u16(__rev2_538, __p3_538)); \ + __ret_538 = __builtin_shufflevector(__ret_538, __ret_538, 3, 2, 1, 0); \ + __ret_538; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlal_high_lane_s32(__p0_447, __p1_447, __p2_447, __p3_447) __extension__ ({ \ - int64x2_t __s0_447 = __p0_447; \ - int32x4_t __s1_447 = __p1_447; \ - int32x2_t __s2_447 = __p2_447; \ - int64x2_t __ret_447; \ - __ret_447 = __s0_447 + vmull_s32(vget_high_s32(__s1_447), splat_lane_s32(__s2_447, __p3_447)); \ - __ret_447; \ +#define vmlal_high_lane_s32(__p0_539, __p1_539, __p2_539, __p3_539) __extension__ ({ \ + int64x2_t __s0_539 = __p0_539; \ + int32x4_t __s1_539 = __p1_539; \ + int32x2_t __s2_539 = __p2_539; \ + int64x2_t __ret_539; \ + __ret_539 = __s0_539 + vmull_s32(vget_high_s32(__s1_539), splat_lane_s32(__s2_539, __p3_539)); \ + __ret_539; \ }) #else -#define vmlal_high_lane_s32(__p0_448, __p1_448, __p2_448, __p3_448) __extension__ ({ \ - int64x2_t __s0_448 = __p0_448; \ - int32x4_t __s1_448 = __p1_448; \ - int32x2_t __s2_448 = __p2_448; \ - int64x2_t __rev0_448; __rev0_448 = __builtin_shufflevector(__s0_448, __s0_448, 1, 0); \ - int32x4_t __rev1_448; __rev1_448 = __builtin_shufflevector(__s1_448, __s1_448, 3, 2, 1, 0); \ - int32x2_t __rev2_448; __rev2_448 = __builtin_shufflevector(__s2_448, __s2_448, 1, 0); \ - int64x2_t __ret_448; \ - __ret_448 = __rev0_448 + __noswap_vmull_s32(__noswap_vget_high_s32(__rev1_448), __noswap_splat_lane_s32(__rev2_448, __p3_448)); \ - __ret_448 = __builtin_shufflevector(__ret_448, __ret_448, 1, 0); \ - __ret_448; \ +#define vmlal_high_lane_s32(__p0_540, __p1_540, __p2_540, __p3_540) __extension__ ({ \ + int64x2_t __s0_540 = __p0_540; \ + int32x4_t __s1_540 = __p1_540; \ + int32x2_t __s2_540 = __p2_540; \ + int64x2_t __rev0_540; __rev0_540 = 
__builtin_shufflevector(__s0_540, __s0_540, 1, 0); \ + int32x4_t __rev1_540; __rev1_540 = __builtin_shufflevector(__s1_540, __s1_540, 3, 2, 1, 0); \ + int32x2_t __rev2_540; __rev2_540 = __builtin_shufflevector(__s2_540, __s2_540, 1, 0); \ + int64x2_t __ret_540; \ + __ret_540 = __rev0_540 + __noswap_vmull_s32(__noswap_vget_high_s32(__rev1_540), __noswap_splat_lane_s32(__rev2_540, __p3_540)); \ + __ret_540 = __builtin_shufflevector(__ret_540, __ret_540, 1, 0); \ + __ret_540; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlal_high_lane_s16(__p0_449, __p1_449, __p2_449, __p3_449) __extension__ ({ \ - int32x4_t __s0_449 = __p0_449; \ - int16x8_t __s1_449 = __p1_449; \ - int16x4_t __s2_449 = __p2_449; \ - int32x4_t __ret_449; \ - __ret_449 = __s0_449 + vmull_s16(vget_high_s16(__s1_449), splat_lane_s16(__s2_449, __p3_449)); \ - __ret_449; \ +#define vmlal_high_lane_s16(__p0_541, __p1_541, __p2_541, __p3_541) __extension__ ({ \ + int32x4_t __s0_541 = __p0_541; \ + int16x8_t __s1_541 = __p1_541; \ + int16x4_t __s2_541 = __p2_541; \ + int32x4_t __ret_541; \ + __ret_541 = __s0_541 + vmull_s16(vget_high_s16(__s1_541), splat_lane_s16(__s2_541, __p3_541)); \ + __ret_541; \ }) #else -#define vmlal_high_lane_s16(__p0_450, __p1_450, __p2_450, __p3_450) __extension__ ({ \ - int32x4_t __s0_450 = __p0_450; \ - int16x8_t __s1_450 = __p1_450; \ - int16x4_t __s2_450 = __p2_450; \ - int32x4_t __rev0_450; __rev0_450 = __builtin_shufflevector(__s0_450, __s0_450, 3, 2, 1, 0); \ - int16x8_t __rev1_450; __rev1_450 = __builtin_shufflevector(__s1_450, __s1_450, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x4_t __rev2_450; __rev2_450 = __builtin_shufflevector(__s2_450, __s2_450, 3, 2, 1, 0); \ - int32x4_t __ret_450; \ - __ret_450 = __rev0_450 + __noswap_vmull_s16(__noswap_vget_high_s16(__rev1_450), __noswap_splat_lane_s16(__rev2_450, __p3_450)); \ - __ret_450 = __builtin_shufflevector(__ret_450, __ret_450, 3, 2, 1, 0); \ - __ret_450; \ +#define vmlal_high_lane_s16(__p0_542, __p1_542, __p2_542, __p3_542) __extension__ ({ \ + int32x4_t __s0_542 = __p0_542; \ + int16x8_t __s1_542 = __p1_542; \ + int16x4_t __s2_542 = __p2_542; \ + int32x4_t __rev0_542; __rev0_542 = __builtin_shufflevector(__s0_542, __s0_542, 3, 2, 1, 0); \ + int16x8_t __rev1_542; __rev1_542 = __builtin_shufflevector(__s1_542, __s1_542, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x4_t __rev2_542; __rev2_542 = __builtin_shufflevector(__s2_542, __s2_542, 3, 2, 1, 0); \ + int32x4_t __ret_542; \ + __ret_542 = __rev0_542 + __noswap_vmull_s16(__noswap_vget_high_s16(__rev1_542), __noswap_splat_lane_s16(__rev2_542, __p3_542)); \ + __ret_542 = __builtin_shufflevector(__ret_542, __ret_542, 3, 2, 1, 0); \ + __ret_542; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlal_high_laneq_u32(__p0_451, __p1_451, __p2_451, __p3_451) __extension__ ({ \ - uint64x2_t __s0_451 = __p0_451; \ - uint32x4_t __s1_451 = __p1_451; \ - uint32x4_t __s2_451 = __p2_451; \ - uint64x2_t __ret_451; \ - __ret_451 = __s0_451 + vmull_u32(vget_high_u32(__s1_451), splat_laneq_u32(__s2_451, __p3_451)); \ - __ret_451; \ +#define vmlal_high_laneq_u32(__p0_543, __p1_543, __p2_543, __p3_543) __extension__ ({ \ + uint64x2_t __s0_543 = __p0_543; \ + uint32x4_t __s1_543 = __p1_543; \ + uint32x4_t __s2_543 = __p2_543; \ + uint64x2_t __ret_543; \ + __ret_543 = __s0_543 + vmull_u32(vget_high_u32(__s1_543), splat_laneq_u32(__s2_543, __p3_543)); \ + __ret_543; \ }) #else -#define vmlal_high_laneq_u32(__p0_452, __p1_452, __p2_452, __p3_452) __extension__ ({ \ - uint64x2_t __s0_452 = __p0_452; \ - uint32x4_t __s1_452 = __p1_452; \ - 
uint32x4_t __s2_452 = __p2_452; \ - uint64x2_t __rev0_452; __rev0_452 = __builtin_shufflevector(__s0_452, __s0_452, 1, 0); \ - uint32x4_t __rev1_452; __rev1_452 = __builtin_shufflevector(__s1_452, __s1_452, 3, 2, 1, 0); \ - uint32x4_t __rev2_452; __rev2_452 = __builtin_shufflevector(__s2_452, __s2_452, 3, 2, 1, 0); \ - uint64x2_t __ret_452; \ - __ret_452 = __rev0_452 + __noswap_vmull_u32(__noswap_vget_high_u32(__rev1_452), __noswap_splat_laneq_u32(__rev2_452, __p3_452)); \ - __ret_452 = __builtin_shufflevector(__ret_452, __ret_452, 1, 0); \ - __ret_452; \ +#define vmlal_high_laneq_u32(__p0_544, __p1_544, __p2_544, __p3_544) __extension__ ({ \ + uint64x2_t __s0_544 = __p0_544; \ + uint32x4_t __s1_544 = __p1_544; \ + uint32x4_t __s2_544 = __p2_544; \ + uint64x2_t __rev0_544; __rev0_544 = __builtin_shufflevector(__s0_544, __s0_544, 1, 0); \ + uint32x4_t __rev1_544; __rev1_544 = __builtin_shufflevector(__s1_544, __s1_544, 3, 2, 1, 0); \ + uint32x4_t __rev2_544; __rev2_544 = __builtin_shufflevector(__s2_544, __s2_544, 3, 2, 1, 0); \ + uint64x2_t __ret_544; \ + __ret_544 = __rev0_544 + __noswap_vmull_u32(__noswap_vget_high_u32(__rev1_544), __noswap_splat_laneq_u32(__rev2_544, __p3_544)); \ + __ret_544 = __builtin_shufflevector(__ret_544, __ret_544, 1, 0); \ + __ret_544; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlal_high_laneq_u16(__p0_453, __p1_453, __p2_453, __p3_453) __extension__ ({ \ - uint32x4_t __s0_453 = __p0_453; \ - uint16x8_t __s1_453 = __p1_453; \ - uint16x8_t __s2_453 = __p2_453; \ - uint32x4_t __ret_453; \ - __ret_453 = __s0_453 + vmull_u16(vget_high_u16(__s1_453), splat_laneq_u16(__s2_453, __p3_453)); \ - __ret_453; \ +#define vmlal_high_laneq_u16(__p0_545, __p1_545, __p2_545, __p3_545) __extension__ ({ \ + uint32x4_t __s0_545 = __p0_545; \ + uint16x8_t __s1_545 = __p1_545; \ + uint16x8_t __s2_545 = __p2_545; \ + uint32x4_t __ret_545; \ + __ret_545 = __s0_545 + vmull_u16(vget_high_u16(__s1_545), splat_laneq_u16(__s2_545, __p3_545)); \ + __ret_545; \ }) #else -#define vmlal_high_laneq_u16(__p0_454, __p1_454, __p2_454, __p3_454) __extension__ ({ \ - uint32x4_t __s0_454 = __p0_454; \ - uint16x8_t __s1_454 = __p1_454; \ - uint16x8_t __s2_454 = __p2_454; \ - uint32x4_t __rev0_454; __rev0_454 = __builtin_shufflevector(__s0_454, __s0_454, 3, 2, 1, 0); \ - uint16x8_t __rev1_454; __rev1_454 = __builtin_shufflevector(__s1_454, __s1_454, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __rev2_454; __rev2_454 = __builtin_shufflevector(__s2_454, __s2_454, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint32x4_t __ret_454; \ - __ret_454 = __rev0_454 + __noswap_vmull_u16(__noswap_vget_high_u16(__rev1_454), __noswap_splat_laneq_u16(__rev2_454, __p3_454)); \ - __ret_454 = __builtin_shufflevector(__ret_454, __ret_454, 3, 2, 1, 0); \ - __ret_454; \ +#define vmlal_high_laneq_u16(__p0_546, __p1_546, __p2_546, __p3_546) __extension__ ({ \ + uint32x4_t __s0_546 = __p0_546; \ + uint16x8_t __s1_546 = __p1_546; \ + uint16x8_t __s2_546 = __p2_546; \ + uint32x4_t __rev0_546; __rev0_546 = __builtin_shufflevector(__s0_546, __s0_546, 3, 2, 1, 0); \ + uint16x8_t __rev1_546; __rev1_546 = __builtin_shufflevector(__s1_546, __s1_546, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __rev2_546; __rev2_546 = __builtin_shufflevector(__s2_546, __s2_546, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint32x4_t __ret_546; \ + __ret_546 = __rev0_546 + __noswap_vmull_u16(__noswap_vget_high_u16(__rev1_546), __noswap_splat_laneq_u16(__rev2_546, __p3_546)); \ + __ret_546 = __builtin_shufflevector(__ret_546, __ret_546, 3, 2, 1, 0); \ + __ret_546; \ }) #endif #ifdef 
__LITTLE_ENDIAN__ -#define vmlal_high_laneq_s32(__p0_455, __p1_455, __p2_455, __p3_455) __extension__ ({ \ - int64x2_t __s0_455 = __p0_455; \ - int32x4_t __s1_455 = __p1_455; \ - int32x4_t __s2_455 = __p2_455; \ - int64x2_t __ret_455; \ - __ret_455 = __s0_455 + vmull_s32(vget_high_s32(__s1_455), splat_laneq_s32(__s2_455, __p3_455)); \ - __ret_455; \ +#define vmlal_high_laneq_s32(__p0_547, __p1_547, __p2_547, __p3_547) __extension__ ({ \ + int64x2_t __s0_547 = __p0_547; \ + int32x4_t __s1_547 = __p1_547; \ + int32x4_t __s2_547 = __p2_547; \ + int64x2_t __ret_547; \ + __ret_547 = __s0_547 + vmull_s32(vget_high_s32(__s1_547), splat_laneq_s32(__s2_547, __p3_547)); \ + __ret_547; \ }) #else -#define vmlal_high_laneq_s32(__p0_456, __p1_456, __p2_456, __p3_456) __extension__ ({ \ - int64x2_t __s0_456 = __p0_456; \ - int32x4_t __s1_456 = __p1_456; \ - int32x4_t __s2_456 = __p2_456; \ - int64x2_t __rev0_456; __rev0_456 = __builtin_shufflevector(__s0_456, __s0_456, 1, 0); \ - int32x4_t __rev1_456; __rev1_456 = __builtin_shufflevector(__s1_456, __s1_456, 3, 2, 1, 0); \ - int32x4_t __rev2_456; __rev2_456 = __builtin_shufflevector(__s2_456, __s2_456, 3, 2, 1, 0); \ - int64x2_t __ret_456; \ - __ret_456 = __rev0_456 + __noswap_vmull_s32(__noswap_vget_high_s32(__rev1_456), __noswap_splat_laneq_s32(__rev2_456, __p3_456)); \ - __ret_456 = __builtin_shufflevector(__ret_456, __ret_456, 1, 0); \ - __ret_456; \ +#define vmlal_high_laneq_s32(__p0_548, __p1_548, __p2_548, __p3_548) __extension__ ({ \ + int64x2_t __s0_548 = __p0_548; \ + int32x4_t __s1_548 = __p1_548; \ + int32x4_t __s2_548 = __p2_548; \ + int64x2_t __rev0_548; __rev0_548 = __builtin_shufflevector(__s0_548, __s0_548, 1, 0); \ + int32x4_t __rev1_548; __rev1_548 = __builtin_shufflevector(__s1_548, __s1_548, 3, 2, 1, 0); \ + int32x4_t __rev2_548; __rev2_548 = __builtin_shufflevector(__s2_548, __s2_548, 3, 2, 1, 0); \ + int64x2_t __ret_548; \ + __ret_548 = __rev0_548 + __noswap_vmull_s32(__noswap_vget_high_s32(__rev1_548), __noswap_splat_laneq_s32(__rev2_548, __p3_548)); \ + __ret_548 = __builtin_shufflevector(__ret_548, __ret_548, 1, 0); \ + __ret_548; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlal_high_laneq_s16(__p0_457, __p1_457, __p2_457, __p3_457) __extension__ ({ \ - int32x4_t __s0_457 = __p0_457; \ - int16x8_t __s1_457 = __p1_457; \ - int16x8_t __s2_457 = __p2_457; \ - int32x4_t __ret_457; \ - __ret_457 = __s0_457 + vmull_s16(vget_high_s16(__s1_457), splat_laneq_s16(__s2_457, __p3_457)); \ - __ret_457; \ +#define vmlal_high_laneq_s16(__p0_549, __p1_549, __p2_549, __p3_549) __extension__ ({ \ + int32x4_t __s0_549 = __p0_549; \ + int16x8_t __s1_549 = __p1_549; \ + int16x8_t __s2_549 = __p2_549; \ + int32x4_t __ret_549; \ + __ret_549 = __s0_549 + vmull_s16(vget_high_s16(__s1_549), splat_laneq_s16(__s2_549, __p3_549)); \ + __ret_549; \ }) #else -#define vmlal_high_laneq_s16(__p0_458, __p1_458, __p2_458, __p3_458) __extension__ ({ \ - int32x4_t __s0_458 = __p0_458; \ - int16x8_t __s1_458 = __p1_458; \ - int16x8_t __s2_458 = __p2_458; \ - int32x4_t __rev0_458; __rev0_458 = __builtin_shufflevector(__s0_458, __s0_458, 3, 2, 1, 0); \ - int16x8_t __rev1_458; __rev1_458 = __builtin_shufflevector(__s1_458, __s1_458, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev2_458; __rev2_458 = __builtin_shufflevector(__s2_458, __s2_458, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32x4_t __ret_458; \ - __ret_458 = __rev0_458 + __noswap_vmull_s16(__noswap_vget_high_s16(__rev1_458), __noswap_splat_laneq_s16(__rev2_458, __p3_458)); \ - __ret_458 = 
__builtin_shufflevector(__ret_458, __ret_458, 3, 2, 1, 0); \ - __ret_458; \ +#define vmlal_high_laneq_s16(__p0_550, __p1_550, __p2_550, __p3_550) __extension__ ({ \ + int32x4_t __s0_550 = __p0_550; \ + int16x8_t __s1_550 = __p1_550; \ + int16x8_t __s2_550 = __p2_550; \ + int32x4_t __rev0_550; __rev0_550 = __builtin_shufflevector(__s0_550, __s0_550, 3, 2, 1, 0); \ + int16x8_t __rev1_550; __rev1_550 = __builtin_shufflevector(__s1_550, __s1_550, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev2_550; __rev2_550 = __builtin_shufflevector(__s2_550, __s2_550, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x4_t __ret_550; \ + __ret_550 = __rev0_550 + __noswap_vmull_s16(__noswap_vget_high_s16(__rev1_550), __noswap_splat_laneq_s16(__rev2_550, __p3_550)); \ + __ret_550 = __builtin_shufflevector(__ret_550, __ret_550, 3, 2, 1, 0); \ + __ret_550; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlal_laneq_u32(__p0_459, __p1_459, __p2_459, __p3_459) __extension__ ({ \ - uint64x2_t __s0_459 = __p0_459; \ - uint32x2_t __s1_459 = __p1_459; \ - uint32x4_t __s2_459 = __p2_459; \ - uint64x2_t __ret_459; \ - __ret_459 = __s0_459 + vmull_u32(__s1_459, splat_laneq_u32(__s2_459, __p3_459)); \ - __ret_459; \ +#define vmlal_laneq_u32(__p0_551, __p1_551, __p2_551, __p3_551) __extension__ ({ \ + uint64x2_t __s0_551 = __p0_551; \ + uint32x2_t __s1_551 = __p1_551; \ + uint32x4_t __s2_551 = __p2_551; \ + uint64x2_t __ret_551; \ + __ret_551 = __s0_551 + vmull_u32(__s1_551, splat_laneq_u32(__s2_551, __p3_551)); \ + __ret_551; \ }) #else -#define vmlal_laneq_u32(__p0_460, __p1_460, __p2_460, __p3_460) __extension__ ({ \ - uint64x2_t __s0_460 = __p0_460; \ - uint32x2_t __s1_460 = __p1_460; \ - uint32x4_t __s2_460 = __p2_460; \ - uint64x2_t __rev0_460; __rev0_460 = __builtin_shufflevector(__s0_460, __s0_460, 1, 0); \ - uint32x2_t __rev1_460; __rev1_460 = __builtin_shufflevector(__s1_460, __s1_460, 1, 0); \ - uint32x4_t __rev2_460; __rev2_460 = __builtin_shufflevector(__s2_460, __s2_460, 3, 2, 1, 0); \ - uint64x2_t __ret_460; \ - __ret_460 = __rev0_460 + __noswap_vmull_u32(__rev1_460, __noswap_splat_laneq_u32(__rev2_460, __p3_460)); \ - __ret_460 = __builtin_shufflevector(__ret_460, __ret_460, 1, 0); \ - __ret_460; \ +#define vmlal_laneq_u32(__p0_552, __p1_552, __p2_552, __p3_552) __extension__ ({ \ + uint64x2_t __s0_552 = __p0_552; \ + uint32x2_t __s1_552 = __p1_552; \ + uint32x4_t __s2_552 = __p2_552; \ + uint64x2_t __rev0_552; __rev0_552 = __builtin_shufflevector(__s0_552, __s0_552, 1, 0); \ + uint32x2_t __rev1_552; __rev1_552 = __builtin_shufflevector(__s1_552, __s1_552, 1, 0); \ + uint32x4_t __rev2_552; __rev2_552 = __builtin_shufflevector(__s2_552, __s2_552, 3, 2, 1, 0); \ + uint64x2_t __ret_552; \ + __ret_552 = __rev0_552 + __noswap_vmull_u32(__rev1_552, __noswap_splat_laneq_u32(__rev2_552, __p3_552)); \ + __ret_552 = __builtin_shufflevector(__ret_552, __ret_552, 1, 0); \ + __ret_552; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlal_laneq_u16(__p0_461, __p1_461, __p2_461, __p3_461) __extension__ ({ \ - uint32x4_t __s0_461 = __p0_461; \ - uint16x4_t __s1_461 = __p1_461; \ - uint16x8_t __s2_461 = __p2_461; \ - uint32x4_t __ret_461; \ - __ret_461 = __s0_461 + vmull_u16(__s1_461, splat_laneq_u16(__s2_461, __p3_461)); \ - __ret_461; \ +#define vmlal_laneq_u16(__p0_553, __p1_553, __p2_553, __p3_553) __extension__ ({ \ + uint32x4_t __s0_553 = __p0_553; \ + uint16x4_t __s1_553 = __p1_553; \ + uint16x8_t __s2_553 = __p2_553; \ + uint32x4_t __ret_553; \ + __ret_553 = __s0_553 + vmull_u16(__s1_553, splat_laneq_u16(__s2_553, __p3_553)); \ + 
__ret_553; \ }) #else -#define vmlal_laneq_u16(__p0_462, __p1_462, __p2_462, __p3_462) __extension__ ({ \ - uint32x4_t __s0_462 = __p0_462; \ - uint16x4_t __s1_462 = __p1_462; \ - uint16x8_t __s2_462 = __p2_462; \ - uint32x4_t __rev0_462; __rev0_462 = __builtin_shufflevector(__s0_462, __s0_462, 3, 2, 1, 0); \ - uint16x4_t __rev1_462; __rev1_462 = __builtin_shufflevector(__s1_462, __s1_462, 3, 2, 1, 0); \ - uint16x8_t __rev2_462; __rev2_462 = __builtin_shufflevector(__s2_462, __s2_462, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint32x4_t __ret_462; \ - __ret_462 = __rev0_462 + __noswap_vmull_u16(__rev1_462, __noswap_splat_laneq_u16(__rev2_462, __p3_462)); \ - __ret_462 = __builtin_shufflevector(__ret_462, __ret_462, 3, 2, 1, 0); \ - __ret_462; \ +#define vmlal_laneq_u16(__p0_554, __p1_554, __p2_554, __p3_554) __extension__ ({ \ + uint32x4_t __s0_554 = __p0_554; \ + uint16x4_t __s1_554 = __p1_554; \ + uint16x8_t __s2_554 = __p2_554; \ + uint32x4_t __rev0_554; __rev0_554 = __builtin_shufflevector(__s0_554, __s0_554, 3, 2, 1, 0); \ + uint16x4_t __rev1_554; __rev1_554 = __builtin_shufflevector(__s1_554, __s1_554, 3, 2, 1, 0); \ + uint16x8_t __rev2_554; __rev2_554 = __builtin_shufflevector(__s2_554, __s2_554, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint32x4_t __ret_554; \ + __ret_554 = __rev0_554 + __noswap_vmull_u16(__rev1_554, __noswap_splat_laneq_u16(__rev2_554, __p3_554)); \ + __ret_554 = __builtin_shufflevector(__ret_554, __ret_554, 3, 2, 1, 0); \ + __ret_554; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlal_laneq_s32(__p0_463, __p1_463, __p2_463, __p3_463) __extension__ ({ \ - int64x2_t __s0_463 = __p0_463; \ - int32x2_t __s1_463 = __p1_463; \ - int32x4_t __s2_463 = __p2_463; \ - int64x2_t __ret_463; \ - __ret_463 = __s0_463 + vmull_s32(__s1_463, splat_laneq_s32(__s2_463, __p3_463)); \ - __ret_463; \ +#define vmlal_laneq_s32(__p0_555, __p1_555, __p2_555, __p3_555) __extension__ ({ \ + int64x2_t __s0_555 = __p0_555; \ + int32x2_t __s1_555 = __p1_555; \ + int32x4_t __s2_555 = __p2_555; \ + int64x2_t __ret_555; \ + __ret_555 = __s0_555 + vmull_s32(__s1_555, splat_laneq_s32(__s2_555, __p3_555)); \ + __ret_555; \ }) #else -#define vmlal_laneq_s32(__p0_464, __p1_464, __p2_464, __p3_464) __extension__ ({ \ - int64x2_t __s0_464 = __p0_464; \ - int32x2_t __s1_464 = __p1_464; \ - int32x4_t __s2_464 = __p2_464; \ - int64x2_t __rev0_464; __rev0_464 = __builtin_shufflevector(__s0_464, __s0_464, 1, 0); \ - int32x2_t __rev1_464; __rev1_464 = __builtin_shufflevector(__s1_464, __s1_464, 1, 0); \ - int32x4_t __rev2_464; __rev2_464 = __builtin_shufflevector(__s2_464, __s2_464, 3, 2, 1, 0); \ - int64x2_t __ret_464; \ - __ret_464 = __rev0_464 + __noswap_vmull_s32(__rev1_464, __noswap_splat_laneq_s32(__rev2_464, __p3_464)); \ - __ret_464 = __builtin_shufflevector(__ret_464, __ret_464, 1, 0); \ - __ret_464; \ +#define vmlal_laneq_s32(__p0_556, __p1_556, __p2_556, __p3_556) __extension__ ({ \ + int64x2_t __s0_556 = __p0_556; \ + int32x2_t __s1_556 = __p1_556; \ + int32x4_t __s2_556 = __p2_556; \ + int64x2_t __rev0_556; __rev0_556 = __builtin_shufflevector(__s0_556, __s0_556, 1, 0); \ + int32x2_t __rev1_556; __rev1_556 = __builtin_shufflevector(__s1_556, __s1_556, 1, 0); \ + int32x4_t __rev2_556; __rev2_556 = __builtin_shufflevector(__s2_556, __s2_556, 3, 2, 1, 0); \ + int64x2_t __ret_556; \ + __ret_556 = __rev0_556 + __noswap_vmull_s32(__rev1_556, __noswap_splat_laneq_s32(__rev2_556, __p3_556)); \ + __ret_556 = __builtin_shufflevector(__ret_556, __ret_556, 1, 0); \ + __ret_556; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define 
vmlal_laneq_s16(__p0_465, __p1_465, __p2_465, __p3_465) __extension__ ({ \ - int32x4_t __s0_465 = __p0_465; \ - int16x4_t __s1_465 = __p1_465; \ - int16x8_t __s2_465 = __p2_465; \ - int32x4_t __ret_465; \ - __ret_465 = __s0_465 + vmull_s16(__s1_465, splat_laneq_s16(__s2_465, __p3_465)); \ - __ret_465; \ +#define vmlal_laneq_s16(__p0_557, __p1_557, __p2_557, __p3_557) __extension__ ({ \ + int32x4_t __s0_557 = __p0_557; \ + int16x4_t __s1_557 = __p1_557; \ + int16x8_t __s2_557 = __p2_557; \ + int32x4_t __ret_557; \ + __ret_557 = __s0_557 + vmull_s16(__s1_557, splat_laneq_s16(__s2_557, __p3_557)); \ + __ret_557; \ }) #else -#define vmlal_laneq_s16(__p0_466, __p1_466, __p2_466, __p3_466) __extension__ ({ \ - int32x4_t __s0_466 = __p0_466; \ - int16x4_t __s1_466 = __p1_466; \ - int16x8_t __s2_466 = __p2_466; \ - int32x4_t __rev0_466; __rev0_466 = __builtin_shufflevector(__s0_466, __s0_466, 3, 2, 1, 0); \ - int16x4_t __rev1_466; __rev1_466 = __builtin_shufflevector(__s1_466, __s1_466, 3, 2, 1, 0); \ - int16x8_t __rev2_466; __rev2_466 = __builtin_shufflevector(__s2_466, __s2_466, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32x4_t __ret_466; \ - __ret_466 = __rev0_466 + __noswap_vmull_s16(__rev1_466, __noswap_splat_laneq_s16(__rev2_466, __p3_466)); \ - __ret_466 = __builtin_shufflevector(__ret_466, __ret_466, 3, 2, 1, 0); \ - __ret_466; \ +#define vmlal_laneq_s16(__p0_558, __p1_558, __p2_558, __p3_558) __extension__ ({ \ + int32x4_t __s0_558 = __p0_558; \ + int16x4_t __s1_558 = __p1_558; \ + int16x8_t __s2_558 = __p2_558; \ + int32x4_t __rev0_558; __rev0_558 = __builtin_shufflevector(__s0_558, __s0_558, 3, 2, 1, 0); \ + int16x4_t __rev1_558; __rev1_558 = __builtin_shufflevector(__s1_558, __s1_558, 3, 2, 1, 0); \ + int16x8_t __rev2_558; __rev2_558 = __builtin_shufflevector(__s2_558, __s2_558, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x4_t __ret_558; \ + __ret_558 = __rev0_558 + __noswap_vmull_s16(__rev1_558, __noswap_splat_laneq_s16(__rev2_558, __p3_558)); \ + __ret_558 = __builtin_shufflevector(__ret_558, __ret_558, 3, 2, 1, 0); \ + __ret_558; \ }) #endif @@ -53116,547 +54803,530 @@ __ai float64x1_t vmls_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) return __ret; } #ifdef __LITTLE_ENDIAN__ -#define vmlsq_laneq_u32(__p0_467, __p1_467, __p2_467, __p3_467) __extension__ ({ \ - uint32x4_t __s0_467 = __p0_467; \ - uint32x4_t __s1_467 = __p1_467; \ - uint32x4_t __s2_467 = __p2_467; \ - uint32x4_t __ret_467; \ - __ret_467 = __s0_467 - __s1_467 * splatq_laneq_u32(__s2_467, __p3_467); \ - __ret_467; \ +#define vmlsq_laneq_u32(__p0_559, __p1_559, __p2_559, __p3_559) __extension__ ({ \ + uint32x4_t __s0_559 = __p0_559; \ + uint32x4_t __s1_559 = __p1_559; \ + uint32x4_t __s2_559 = __p2_559; \ + uint32x4_t __ret_559; \ + __ret_559 = __s0_559 - __s1_559 * splatq_laneq_u32(__s2_559, __p3_559); \ + __ret_559; \ }) #else -#define vmlsq_laneq_u32(__p0_468, __p1_468, __p2_468, __p3_468) __extension__ ({ \ - uint32x4_t __s0_468 = __p0_468; \ - uint32x4_t __s1_468 = __p1_468; \ - uint32x4_t __s2_468 = __p2_468; \ - uint32x4_t __rev0_468; __rev0_468 = __builtin_shufflevector(__s0_468, __s0_468, 3, 2, 1, 0); \ - uint32x4_t __rev1_468; __rev1_468 = __builtin_shufflevector(__s1_468, __s1_468, 3, 2, 1, 0); \ - uint32x4_t __rev2_468; __rev2_468 = __builtin_shufflevector(__s2_468, __s2_468, 3, 2, 1, 0); \ - uint32x4_t __ret_468; \ - __ret_468 = __rev0_468 - __rev1_468 * __noswap_splatq_laneq_u32(__rev2_468, __p3_468); \ - __ret_468 = __builtin_shufflevector(__ret_468, __ret_468, 3, 2, 1, 0); \ - __ret_468; \ +#define 
vmlsq_laneq_u32(__p0_560, __p1_560, __p2_560, __p3_560) __extension__ ({ \ + uint32x4_t __s0_560 = __p0_560; \ + uint32x4_t __s1_560 = __p1_560; \ + uint32x4_t __s2_560 = __p2_560; \ + uint32x4_t __rev0_560; __rev0_560 = __builtin_shufflevector(__s0_560, __s0_560, 3, 2, 1, 0); \ + uint32x4_t __rev1_560; __rev1_560 = __builtin_shufflevector(__s1_560, __s1_560, 3, 2, 1, 0); \ + uint32x4_t __rev2_560; __rev2_560 = __builtin_shufflevector(__s2_560, __s2_560, 3, 2, 1, 0); \ + uint32x4_t __ret_560; \ + __ret_560 = __rev0_560 - __rev1_560 * __noswap_splatq_laneq_u32(__rev2_560, __p3_560); \ + __ret_560 = __builtin_shufflevector(__ret_560, __ret_560, 3, 2, 1, 0); \ + __ret_560; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlsq_laneq_u16(__p0_469, __p1_469, __p2_469, __p3_469) __extension__ ({ \ - uint16x8_t __s0_469 = __p0_469; \ - uint16x8_t __s1_469 = __p1_469; \ - uint16x8_t __s2_469 = __p2_469; \ - uint16x8_t __ret_469; \ - __ret_469 = __s0_469 - __s1_469 * splatq_laneq_u16(__s2_469, __p3_469); \ - __ret_469; \ +#define vmlsq_laneq_u16(__p0_561, __p1_561, __p2_561, __p3_561) __extension__ ({ \ + uint16x8_t __s0_561 = __p0_561; \ + uint16x8_t __s1_561 = __p1_561; \ + uint16x8_t __s2_561 = __p2_561; \ + uint16x8_t __ret_561; \ + __ret_561 = __s0_561 - __s1_561 * splatq_laneq_u16(__s2_561, __p3_561); \ + __ret_561; \ }) #else -#define vmlsq_laneq_u16(__p0_470, __p1_470, __p2_470, __p3_470) __extension__ ({ \ - uint16x8_t __s0_470 = __p0_470; \ - uint16x8_t __s1_470 = __p1_470; \ - uint16x8_t __s2_470 = __p2_470; \ - uint16x8_t __rev0_470; __rev0_470 = __builtin_shufflevector(__s0_470, __s0_470, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __rev1_470; __rev1_470 = __builtin_shufflevector(__s1_470, __s1_470, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __rev2_470; __rev2_470 = __builtin_shufflevector(__s2_470, __s2_470, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __ret_470; \ - __ret_470 = __rev0_470 - __rev1_470 * __noswap_splatq_laneq_u16(__rev2_470, __p3_470); \ - __ret_470 = __builtin_shufflevector(__ret_470, __ret_470, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_470; \ +#define vmlsq_laneq_u16(__p0_562, __p1_562, __p2_562, __p3_562) __extension__ ({ \ + uint16x8_t __s0_562 = __p0_562; \ + uint16x8_t __s1_562 = __p1_562; \ + uint16x8_t __s2_562 = __p2_562; \ + uint16x8_t __rev0_562; __rev0_562 = __builtin_shufflevector(__s0_562, __s0_562, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __rev1_562; __rev1_562 = __builtin_shufflevector(__s1_562, __s1_562, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __rev2_562; __rev2_562 = __builtin_shufflevector(__s2_562, __s2_562, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __ret_562; \ + __ret_562 = __rev0_562 - __rev1_562 * __noswap_splatq_laneq_u16(__rev2_562, __p3_562); \ + __ret_562 = __builtin_shufflevector(__ret_562, __ret_562, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_562; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlsq_laneq_f32(__p0_471, __p1_471, __p2_471, __p3_471) __extension__ ({ \ - float32x4_t __s0_471 = __p0_471; \ - float32x4_t __s1_471 = __p1_471; \ - float32x4_t __s2_471 = __p2_471; \ - float32x4_t __ret_471; \ - __ret_471 = __s0_471 - __s1_471 * splatq_laneq_f32(__s2_471, __p3_471); \ - __ret_471; \ +#define vmlsq_laneq_f32(__p0_563, __p1_563, __p2_563, __p3_563) __extension__ ({ \ + float32x4_t __s0_563 = __p0_563; \ + float32x4_t __s1_563 = __p1_563; \ + float32x4_t __s2_563 = __p2_563; \ + float32x4_t __ret_563; \ + __ret_563 = __s0_563 - __s1_563 * splatq_laneq_f32(__s2_563, __p3_563); \ + __ret_563; \ }) #else -#define vmlsq_laneq_f32(__p0_472, __p1_472, __p2_472, __p3_472) 
__extension__ ({ \ - float32x4_t __s0_472 = __p0_472; \ - float32x4_t __s1_472 = __p1_472; \ - float32x4_t __s2_472 = __p2_472; \ - float32x4_t __rev0_472; __rev0_472 = __builtin_shufflevector(__s0_472, __s0_472, 3, 2, 1, 0); \ - float32x4_t __rev1_472; __rev1_472 = __builtin_shufflevector(__s1_472, __s1_472, 3, 2, 1, 0); \ - float32x4_t __rev2_472; __rev2_472 = __builtin_shufflevector(__s2_472, __s2_472, 3, 2, 1, 0); \ - float32x4_t __ret_472; \ - __ret_472 = __rev0_472 - __rev1_472 * __noswap_splatq_laneq_f32(__rev2_472, __p3_472); \ - __ret_472 = __builtin_shufflevector(__ret_472, __ret_472, 3, 2, 1, 0); \ - __ret_472; \ +#define vmlsq_laneq_f32(__p0_564, __p1_564, __p2_564, __p3_564) __extension__ ({ \ + float32x4_t __s0_564 = __p0_564; \ + float32x4_t __s1_564 = __p1_564; \ + float32x4_t __s2_564 = __p2_564; \ + float32x4_t __rev0_564; __rev0_564 = __builtin_shufflevector(__s0_564, __s0_564, 3, 2, 1, 0); \ + float32x4_t __rev1_564; __rev1_564 = __builtin_shufflevector(__s1_564, __s1_564, 3, 2, 1, 0); \ + float32x4_t __rev2_564; __rev2_564 = __builtin_shufflevector(__s2_564, __s2_564, 3, 2, 1, 0); \ + float32x4_t __ret_564; \ + __ret_564 = __rev0_564 - __rev1_564 * __noswap_splatq_laneq_f32(__rev2_564, __p3_564); \ + __ret_564 = __builtin_shufflevector(__ret_564, __ret_564, 3, 2, 1, 0); \ + __ret_564; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlsq_laneq_s32(__p0_473, __p1_473, __p2_473, __p3_473) __extension__ ({ \ - int32x4_t __s0_473 = __p0_473; \ - int32x4_t __s1_473 = __p1_473; \ - int32x4_t __s2_473 = __p2_473; \ - int32x4_t __ret_473; \ - __ret_473 = __s0_473 - __s1_473 * splatq_laneq_s32(__s2_473, __p3_473); \ - __ret_473; \ +#define vmlsq_laneq_s32(__p0_565, __p1_565, __p2_565, __p3_565) __extension__ ({ \ + int32x4_t __s0_565 = __p0_565; \ + int32x4_t __s1_565 = __p1_565; \ + int32x4_t __s2_565 = __p2_565; \ + int32x4_t __ret_565; \ + __ret_565 = __s0_565 - __s1_565 * splatq_laneq_s32(__s2_565, __p3_565); \ + __ret_565; \ }) #else -#define vmlsq_laneq_s32(__p0_474, __p1_474, __p2_474, __p3_474) __extension__ ({ \ - int32x4_t __s0_474 = __p0_474; \ - int32x4_t __s1_474 = __p1_474; \ - int32x4_t __s2_474 = __p2_474; \ - int32x4_t __rev0_474; __rev0_474 = __builtin_shufflevector(__s0_474, __s0_474, 3, 2, 1, 0); \ - int32x4_t __rev1_474; __rev1_474 = __builtin_shufflevector(__s1_474, __s1_474, 3, 2, 1, 0); \ - int32x4_t __rev2_474; __rev2_474 = __builtin_shufflevector(__s2_474, __s2_474, 3, 2, 1, 0); \ - int32x4_t __ret_474; \ - __ret_474 = __rev0_474 - __rev1_474 * __noswap_splatq_laneq_s32(__rev2_474, __p3_474); \ - __ret_474 = __builtin_shufflevector(__ret_474, __ret_474, 3, 2, 1, 0); \ - __ret_474; \ +#define vmlsq_laneq_s32(__p0_566, __p1_566, __p2_566, __p3_566) __extension__ ({ \ + int32x4_t __s0_566 = __p0_566; \ + int32x4_t __s1_566 = __p1_566; \ + int32x4_t __s2_566 = __p2_566; \ + int32x4_t __rev0_566; __rev0_566 = __builtin_shufflevector(__s0_566, __s0_566, 3, 2, 1, 0); \ + int32x4_t __rev1_566; __rev1_566 = __builtin_shufflevector(__s1_566, __s1_566, 3, 2, 1, 0); \ + int32x4_t __rev2_566; __rev2_566 = __builtin_shufflevector(__s2_566, __s2_566, 3, 2, 1, 0); \ + int32x4_t __ret_566; \ + __ret_566 = __rev0_566 - __rev1_566 * __noswap_splatq_laneq_s32(__rev2_566, __p3_566); \ + __ret_566 = __builtin_shufflevector(__ret_566, __ret_566, 3, 2, 1, 0); \ + __ret_566; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlsq_laneq_s16(__p0_475, __p1_475, __p2_475, __p3_475) __extension__ ({ \ - int16x8_t __s0_475 = __p0_475; \ - int16x8_t __s1_475 = __p1_475; \ - int16x8_t 
__s2_475 = __p2_475; \ - int16x8_t __ret_475; \ - __ret_475 = __s0_475 - __s1_475 * splatq_laneq_s16(__s2_475, __p3_475); \ - __ret_475; \ +#define vmlsq_laneq_s16(__p0_567, __p1_567, __p2_567, __p3_567) __extension__ ({ \ + int16x8_t __s0_567 = __p0_567; \ + int16x8_t __s1_567 = __p1_567; \ + int16x8_t __s2_567 = __p2_567; \ + int16x8_t __ret_567; \ + __ret_567 = __s0_567 - __s1_567 * splatq_laneq_s16(__s2_567, __p3_567); \ + __ret_567; \ }) #else -#define vmlsq_laneq_s16(__p0_476, __p1_476, __p2_476, __p3_476) __extension__ ({ \ - int16x8_t __s0_476 = __p0_476; \ - int16x8_t __s1_476 = __p1_476; \ - int16x8_t __s2_476 = __p2_476; \ - int16x8_t __rev0_476; __rev0_476 = __builtin_shufflevector(__s0_476, __s0_476, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev1_476; __rev1_476 = __builtin_shufflevector(__s1_476, __s1_476, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev2_476; __rev2_476 = __builtin_shufflevector(__s2_476, __s2_476, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __ret_476; \ - __ret_476 = __rev0_476 - __rev1_476 * __noswap_splatq_laneq_s16(__rev2_476, __p3_476); \ - __ret_476 = __builtin_shufflevector(__ret_476, __ret_476, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_476; \ +#define vmlsq_laneq_s16(__p0_568, __p1_568, __p2_568, __p3_568) __extension__ ({ \ + int16x8_t __s0_568 = __p0_568; \ + int16x8_t __s1_568 = __p1_568; \ + int16x8_t __s2_568 = __p2_568; \ + int16x8_t __rev0_568; __rev0_568 = __builtin_shufflevector(__s0_568, __s0_568, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev1_568; __rev1_568 = __builtin_shufflevector(__s1_568, __s1_568, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev2_568; __rev2_568 = __builtin_shufflevector(__s2_568, __s2_568, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __ret_568; \ + __ret_568 = __rev0_568 - __rev1_568 * __noswap_splatq_laneq_s16(__rev2_568, __p3_568); \ + __ret_568 = __builtin_shufflevector(__ret_568, __ret_568, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_568; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmls_laneq_u32(__p0_477, __p1_477, __p2_477, __p3_477) __extension__ ({ \ - uint32x2_t __s0_477 = __p0_477; \ - uint32x2_t __s1_477 = __p1_477; \ - uint32x4_t __s2_477 = __p2_477; \ - uint32x2_t __ret_477; \ - __ret_477 = __s0_477 - __s1_477 * splat_laneq_u32(__s2_477, __p3_477); \ - __ret_477; \ +#define vmls_laneq_u32(__p0_569, __p1_569, __p2_569, __p3_569) __extension__ ({ \ + uint32x2_t __s0_569 = __p0_569; \ + uint32x2_t __s1_569 = __p1_569; \ + uint32x4_t __s2_569 = __p2_569; \ + uint32x2_t __ret_569; \ + __ret_569 = __s0_569 - __s1_569 * splat_laneq_u32(__s2_569, __p3_569); \ + __ret_569; \ }) #else -#define vmls_laneq_u32(__p0_478, __p1_478, __p2_478, __p3_478) __extension__ ({ \ - uint32x2_t __s0_478 = __p0_478; \ - uint32x2_t __s1_478 = __p1_478; \ - uint32x4_t __s2_478 = __p2_478; \ - uint32x2_t __rev0_478; __rev0_478 = __builtin_shufflevector(__s0_478, __s0_478, 1, 0); \ - uint32x2_t __rev1_478; __rev1_478 = __builtin_shufflevector(__s1_478, __s1_478, 1, 0); \ - uint32x4_t __rev2_478; __rev2_478 = __builtin_shufflevector(__s2_478, __s2_478, 3, 2, 1, 0); \ - uint32x2_t __ret_478; \ - __ret_478 = __rev0_478 - __rev1_478 * __noswap_splat_laneq_u32(__rev2_478, __p3_478); \ - __ret_478 = __builtin_shufflevector(__ret_478, __ret_478, 1, 0); \ - __ret_478; \ +#define vmls_laneq_u32(__p0_570, __p1_570, __p2_570, __p3_570) __extension__ ({ \ + uint32x2_t __s0_570 = __p0_570; \ + uint32x2_t __s1_570 = __p1_570; \ + uint32x4_t __s2_570 = __p2_570; \ + uint32x2_t __rev0_570; __rev0_570 = __builtin_shufflevector(__s0_570, __s0_570, 1, 0); \ + uint32x2_t __rev1_570; 
__rev1_570 = __builtin_shufflevector(__s1_570, __s1_570, 1, 0); \ + uint32x4_t __rev2_570; __rev2_570 = __builtin_shufflevector(__s2_570, __s2_570, 3, 2, 1, 0); \ + uint32x2_t __ret_570; \ + __ret_570 = __rev0_570 - __rev1_570 * __noswap_splat_laneq_u32(__rev2_570, __p3_570); \ + __ret_570 = __builtin_shufflevector(__ret_570, __ret_570, 1, 0); \ + __ret_570; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmls_laneq_u16(__p0_479, __p1_479, __p2_479, __p3_479) __extension__ ({ \ - uint16x4_t __s0_479 = __p0_479; \ - uint16x4_t __s1_479 = __p1_479; \ - uint16x8_t __s2_479 = __p2_479; \ - uint16x4_t __ret_479; \ - __ret_479 = __s0_479 - __s1_479 * splat_laneq_u16(__s2_479, __p3_479); \ - __ret_479; \ +#define vmls_laneq_u16(__p0_571, __p1_571, __p2_571, __p3_571) __extension__ ({ \ + uint16x4_t __s0_571 = __p0_571; \ + uint16x4_t __s1_571 = __p1_571; \ + uint16x8_t __s2_571 = __p2_571; \ + uint16x4_t __ret_571; \ + __ret_571 = __s0_571 - __s1_571 * splat_laneq_u16(__s2_571, __p3_571); \ + __ret_571; \ }) #else -#define vmls_laneq_u16(__p0_480, __p1_480, __p2_480, __p3_480) __extension__ ({ \ - uint16x4_t __s0_480 = __p0_480; \ - uint16x4_t __s1_480 = __p1_480; \ - uint16x8_t __s2_480 = __p2_480; \ - uint16x4_t __rev0_480; __rev0_480 = __builtin_shufflevector(__s0_480, __s0_480, 3, 2, 1, 0); \ - uint16x4_t __rev1_480; __rev1_480 = __builtin_shufflevector(__s1_480, __s1_480, 3, 2, 1, 0); \ - uint16x8_t __rev2_480; __rev2_480 = __builtin_shufflevector(__s2_480, __s2_480, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x4_t __ret_480; \ - __ret_480 = __rev0_480 - __rev1_480 * __noswap_splat_laneq_u16(__rev2_480, __p3_480); \ - __ret_480 = __builtin_shufflevector(__ret_480, __ret_480, 3, 2, 1, 0); \ - __ret_480; \ +#define vmls_laneq_u16(__p0_572, __p1_572, __p2_572, __p3_572) __extension__ ({ \ + uint16x4_t __s0_572 = __p0_572; \ + uint16x4_t __s1_572 = __p1_572; \ + uint16x8_t __s2_572 = __p2_572; \ + uint16x4_t __rev0_572; __rev0_572 = __builtin_shufflevector(__s0_572, __s0_572, 3, 2, 1, 0); \ + uint16x4_t __rev1_572; __rev1_572 = __builtin_shufflevector(__s1_572, __s1_572, 3, 2, 1, 0); \ + uint16x8_t __rev2_572; __rev2_572 = __builtin_shufflevector(__s2_572, __s2_572, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x4_t __ret_572; \ + __ret_572 = __rev0_572 - __rev1_572 * __noswap_splat_laneq_u16(__rev2_572, __p3_572); \ + __ret_572 = __builtin_shufflevector(__ret_572, __ret_572, 3, 2, 1, 0); \ + __ret_572; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmls_laneq_f32(__p0_481, __p1_481, __p2_481, __p3_481) __extension__ ({ \ - float32x2_t __s0_481 = __p0_481; \ - float32x2_t __s1_481 = __p1_481; \ - float32x4_t __s2_481 = __p2_481; \ - float32x2_t __ret_481; \ - __ret_481 = __s0_481 - __s1_481 * splat_laneq_f32(__s2_481, __p3_481); \ - __ret_481; \ +#define vmls_laneq_f32(__p0_573, __p1_573, __p2_573, __p3_573) __extension__ ({ \ + float32x2_t __s0_573 = __p0_573; \ + float32x2_t __s1_573 = __p1_573; \ + float32x4_t __s2_573 = __p2_573; \ + float32x2_t __ret_573; \ + __ret_573 = __s0_573 - __s1_573 * splat_laneq_f32(__s2_573, __p3_573); \ + __ret_573; \ }) #else -#define vmls_laneq_f32(__p0_482, __p1_482, __p2_482, __p3_482) __extension__ ({ \ - float32x2_t __s0_482 = __p0_482; \ - float32x2_t __s1_482 = __p1_482; \ - float32x4_t __s2_482 = __p2_482; \ - float32x2_t __rev0_482; __rev0_482 = __builtin_shufflevector(__s0_482, __s0_482, 1, 0); \ - float32x2_t __rev1_482; __rev1_482 = __builtin_shufflevector(__s1_482, __s1_482, 1, 0); \ - float32x4_t __rev2_482; __rev2_482 = __builtin_shufflevector(__s2_482, __s2_482, 3, 2, 
1, 0); \ - float32x2_t __ret_482; \ - __ret_482 = __rev0_482 - __rev1_482 * __noswap_splat_laneq_f32(__rev2_482, __p3_482); \ - __ret_482 = __builtin_shufflevector(__ret_482, __ret_482, 1, 0); \ - __ret_482; \ +#define vmls_laneq_f32(__p0_574, __p1_574, __p2_574, __p3_574) __extension__ ({ \ + float32x2_t __s0_574 = __p0_574; \ + float32x2_t __s1_574 = __p1_574; \ + float32x4_t __s2_574 = __p2_574; \ + float32x2_t __rev0_574; __rev0_574 = __builtin_shufflevector(__s0_574, __s0_574, 1, 0); \ + float32x2_t __rev1_574; __rev1_574 = __builtin_shufflevector(__s1_574, __s1_574, 1, 0); \ + float32x4_t __rev2_574; __rev2_574 = __builtin_shufflevector(__s2_574, __s2_574, 3, 2, 1, 0); \ + float32x2_t __ret_574; \ + __ret_574 = __rev0_574 - __rev1_574 * __noswap_splat_laneq_f32(__rev2_574, __p3_574); \ + __ret_574 = __builtin_shufflevector(__ret_574, __ret_574, 1, 0); \ + __ret_574; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmls_laneq_s32(__p0_483, __p1_483, __p2_483, __p3_483) __extension__ ({ \ - int32x2_t __s0_483 = __p0_483; \ - int32x2_t __s1_483 = __p1_483; \ - int32x4_t __s2_483 = __p2_483; \ - int32x2_t __ret_483; \ - __ret_483 = __s0_483 - __s1_483 * splat_laneq_s32(__s2_483, __p3_483); \ - __ret_483; \ +#define vmls_laneq_s32(__p0_575, __p1_575, __p2_575, __p3_575) __extension__ ({ \ + int32x2_t __s0_575 = __p0_575; \ + int32x2_t __s1_575 = __p1_575; \ + int32x4_t __s2_575 = __p2_575; \ + int32x2_t __ret_575; \ + __ret_575 = __s0_575 - __s1_575 * splat_laneq_s32(__s2_575, __p3_575); \ + __ret_575; \ }) #else -#define vmls_laneq_s32(__p0_484, __p1_484, __p2_484, __p3_484) __extension__ ({ \ - int32x2_t __s0_484 = __p0_484; \ - int32x2_t __s1_484 = __p1_484; \ - int32x4_t __s2_484 = __p2_484; \ - int32x2_t __rev0_484; __rev0_484 = __builtin_shufflevector(__s0_484, __s0_484, 1, 0); \ - int32x2_t __rev1_484; __rev1_484 = __builtin_shufflevector(__s1_484, __s1_484, 1, 0); \ - int32x4_t __rev2_484; __rev2_484 = __builtin_shufflevector(__s2_484, __s2_484, 3, 2, 1, 0); \ - int32x2_t __ret_484; \ - __ret_484 = __rev0_484 - __rev1_484 * __noswap_splat_laneq_s32(__rev2_484, __p3_484); \ - __ret_484 = __builtin_shufflevector(__ret_484, __ret_484, 1, 0); \ - __ret_484; \ +#define vmls_laneq_s32(__p0_576, __p1_576, __p2_576, __p3_576) __extension__ ({ \ + int32x2_t __s0_576 = __p0_576; \ + int32x2_t __s1_576 = __p1_576; \ + int32x4_t __s2_576 = __p2_576; \ + int32x2_t __rev0_576; __rev0_576 = __builtin_shufflevector(__s0_576, __s0_576, 1, 0); \ + int32x2_t __rev1_576; __rev1_576 = __builtin_shufflevector(__s1_576, __s1_576, 1, 0); \ + int32x4_t __rev2_576; __rev2_576 = __builtin_shufflevector(__s2_576, __s2_576, 3, 2, 1, 0); \ + int32x2_t __ret_576; \ + __ret_576 = __rev0_576 - __rev1_576 * __noswap_splat_laneq_s32(__rev2_576, __p3_576); \ + __ret_576 = __builtin_shufflevector(__ret_576, __ret_576, 1, 0); \ + __ret_576; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmls_laneq_s16(__p0_485, __p1_485, __p2_485, __p3_485) __extension__ ({ \ - int16x4_t __s0_485 = __p0_485; \ - int16x4_t __s1_485 = __p1_485; \ - int16x8_t __s2_485 = __p2_485; \ - int16x4_t __ret_485; \ - __ret_485 = __s0_485 - __s1_485 * splat_laneq_s16(__s2_485, __p3_485); \ - __ret_485; \ +#define vmls_laneq_s16(__p0_577, __p1_577, __p2_577, __p3_577) __extension__ ({ \ + int16x4_t __s0_577 = __p0_577; \ + int16x4_t __s1_577 = __p1_577; \ + int16x8_t __s2_577 = __p2_577; \ + int16x4_t __ret_577; \ + __ret_577 = __s0_577 - __s1_577 * splat_laneq_s16(__s2_577, __p3_577); \ + __ret_577; \ }) #else -#define vmls_laneq_s16(__p0_486, 
__p1_486, __p2_486, __p3_486) __extension__ ({ \ - int16x4_t __s0_486 = __p0_486; \ - int16x4_t __s1_486 = __p1_486; \ - int16x8_t __s2_486 = __p2_486; \ - int16x4_t __rev0_486; __rev0_486 = __builtin_shufflevector(__s0_486, __s0_486, 3, 2, 1, 0); \ - int16x4_t __rev1_486; __rev1_486 = __builtin_shufflevector(__s1_486, __s1_486, 3, 2, 1, 0); \ - int16x8_t __rev2_486; __rev2_486 = __builtin_shufflevector(__s2_486, __s2_486, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x4_t __ret_486; \ - __ret_486 = __rev0_486 - __rev1_486 * __noswap_splat_laneq_s16(__rev2_486, __p3_486); \ - __ret_486 = __builtin_shufflevector(__ret_486, __ret_486, 3, 2, 1, 0); \ - __ret_486; \ +#define vmls_laneq_s16(__p0_578, __p1_578, __p2_578, __p3_578) __extension__ ({ \ + int16x4_t __s0_578 = __p0_578; \ + int16x4_t __s1_578 = __p1_578; \ + int16x8_t __s2_578 = __p2_578; \ + int16x4_t __rev0_578; __rev0_578 = __builtin_shufflevector(__s0_578, __s0_578, 3, 2, 1, 0); \ + int16x4_t __rev1_578; __rev1_578 = __builtin_shufflevector(__s1_578, __s1_578, 3, 2, 1, 0); \ + int16x8_t __rev2_578; __rev2_578 = __builtin_shufflevector(__s2_578, __s2_578, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x4_t __ret_578; \ + __ret_578 = __rev0_578 - __rev1_578 * __noswap_splat_laneq_s16(__rev2_578, __p3_578); \ + __ret_578 = __builtin_shufflevector(__ret_578, __ret_578, 3, 2, 1, 0); \ + __ret_578; \ }) #endif #ifdef __LITTLE_ENDIAN__ -__ai float64x2_t vmlsq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) { - float64x2_t __ret; - __ret = __p0 - __p1 * (float64x2_t) {__p2, __p2}; - return __ret; -} -#else -__ai float64x2_t vmlsq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) { - float64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); - float64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); - float64x2_t __ret; - __ret = __rev0 - __rev1 * (float64x2_t) {__p2, __p2}; - __ret = __builtin_shufflevector(__ret, __ret, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vmlsl_high_lane_u32(__p0_487, __p1_487, __p2_487, __p3_487) __extension__ ({ \ - uint64x2_t __s0_487 = __p0_487; \ - uint32x4_t __s1_487 = __p1_487; \ - uint32x2_t __s2_487 = __p2_487; \ - uint64x2_t __ret_487; \ - __ret_487 = __s0_487 - vmull_u32(vget_high_u32(__s1_487), splat_lane_u32(__s2_487, __p3_487)); \ - __ret_487; \ +#define vmlsl_high_lane_u32(__p0_579, __p1_579, __p2_579, __p3_579) __extension__ ({ \ + uint64x2_t __s0_579 = __p0_579; \ + uint32x4_t __s1_579 = __p1_579; \ + uint32x2_t __s2_579 = __p2_579; \ + uint64x2_t __ret_579; \ + __ret_579 = __s0_579 - vmull_u32(vget_high_u32(__s1_579), splat_lane_u32(__s2_579, __p3_579)); \ + __ret_579; \ }) #else -#define vmlsl_high_lane_u32(__p0_488, __p1_488, __p2_488, __p3_488) __extension__ ({ \ - uint64x2_t __s0_488 = __p0_488; \ - uint32x4_t __s1_488 = __p1_488; \ - uint32x2_t __s2_488 = __p2_488; \ - uint64x2_t __rev0_488; __rev0_488 = __builtin_shufflevector(__s0_488, __s0_488, 1, 0); \ - uint32x4_t __rev1_488; __rev1_488 = __builtin_shufflevector(__s1_488, __s1_488, 3, 2, 1, 0); \ - uint32x2_t __rev2_488; __rev2_488 = __builtin_shufflevector(__s2_488, __s2_488, 1, 0); \ - uint64x2_t __ret_488; \ - __ret_488 = __rev0_488 - __noswap_vmull_u32(__noswap_vget_high_u32(__rev1_488), __noswap_splat_lane_u32(__rev2_488, __p3_488)); \ - __ret_488 = __builtin_shufflevector(__ret_488, __ret_488, 1, 0); \ - __ret_488; \ +#define vmlsl_high_lane_u32(__p0_580, __p1_580, __p2_580, __p3_580) __extension__ ({ \ + uint64x2_t __s0_580 = __p0_580; \ + uint32x4_t __s1_580 = __p1_580; \ + 
uint32x2_t __s2_580 = __p2_580; \ + uint64x2_t __rev0_580; __rev0_580 = __builtin_shufflevector(__s0_580, __s0_580, 1, 0); \ + uint32x4_t __rev1_580; __rev1_580 = __builtin_shufflevector(__s1_580, __s1_580, 3, 2, 1, 0); \ + uint32x2_t __rev2_580; __rev2_580 = __builtin_shufflevector(__s2_580, __s2_580, 1, 0); \ + uint64x2_t __ret_580; \ + __ret_580 = __rev0_580 - __noswap_vmull_u32(__noswap_vget_high_u32(__rev1_580), __noswap_splat_lane_u32(__rev2_580, __p3_580)); \ + __ret_580 = __builtin_shufflevector(__ret_580, __ret_580, 1, 0); \ + __ret_580; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlsl_high_lane_u16(__p0_489, __p1_489, __p2_489, __p3_489) __extension__ ({ \ - uint32x4_t __s0_489 = __p0_489; \ - uint16x8_t __s1_489 = __p1_489; \ - uint16x4_t __s2_489 = __p2_489; \ - uint32x4_t __ret_489; \ - __ret_489 = __s0_489 - vmull_u16(vget_high_u16(__s1_489), splat_lane_u16(__s2_489, __p3_489)); \ - __ret_489; \ +#define vmlsl_high_lane_u16(__p0_581, __p1_581, __p2_581, __p3_581) __extension__ ({ \ + uint32x4_t __s0_581 = __p0_581; \ + uint16x8_t __s1_581 = __p1_581; \ + uint16x4_t __s2_581 = __p2_581; \ + uint32x4_t __ret_581; \ + __ret_581 = __s0_581 - vmull_u16(vget_high_u16(__s1_581), splat_lane_u16(__s2_581, __p3_581)); \ + __ret_581; \ }) #else -#define vmlsl_high_lane_u16(__p0_490, __p1_490, __p2_490, __p3_490) __extension__ ({ \ - uint32x4_t __s0_490 = __p0_490; \ - uint16x8_t __s1_490 = __p1_490; \ - uint16x4_t __s2_490 = __p2_490; \ - uint32x4_t __rev0_490; __rev0_490 = __builtin_shufflevector(__s0_490, __s0_490, 3, 2, 1, 0); \ - uint16x8_t __rev1_490; __rev1_490 = __builtin_shufflevector(__s1_490, __s1_490, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x4_t __rev2_490; __rev2_490 = __builtin_shufflevector(__s2_490, __s2_490, 3, 2, 1, 0); \ - uint32x4_t __ret_490; \ - __ret_490 = __rev0_490 - __noswap_vmull_u16(__noswap_vget_high_u16(__rev1_490), __noswap_splat_lane_u16(__rev2_490, __p3_490)); \ - __ret_490 = __builtin_shufflevector(__ret_490, __ret_490, 3, 2, 1, 0); \ - __ret_490; \ +#define vmlsl_high_lane_u16(__p0_582, __p1_582, __p2_582, __p3_582) __extension__ ({ \ + uint32x4_t __s0_582 = __p0_582; \ + uint16x8_t __s1_582 = __p1_582; \ + uint16x4_t __s2_582 = __p2_582; \ + uint32x4_t __rev0_582; __rev0_582 = __builtin_shufflevector(__s0_582, __s0_582, 3, 2, 1, 0); \ + uint16x8_t __rev1_582; __rev1_582 = __builtin_shufflevector(__s1_582, __s1_582, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x4_t __rev2_582; __rev2_582 = __builtin_shufflevector(__s2_582, __s2_582, 3, 2, 1, 0); \ + uint32x4_t __ret_582; \ + __ret_582 = __rev0_582 - __noswap_vmull_u16(__noswap_vget_high_u16(__rev1_582), __noswap_splat_lane_u16(__rev2_582, __p3_582)); \ + __ret_582 = __builtin_shufflevector(__ret_582, __ret_582, 3, 2, 1, 0); \ + __ret_582; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlsl_high_lane_s32(__p0_491, __p1_491, __p2_491, __p3_491) __extension__ ({ \ - int64x2_t __s0_491 = __p0_491; \ - int32x4_t __s1_491 = __p1_491; \ - int32x2_t __s2_491 = __p2_491; \ - int64x2_t __ret_491; \ - __ret_491 = __s0_491 - vmull_s32(vget_high_s32(__s1_491), splat_lane_s32(__s2_491, __p3_491)); \ - __ret_491; \ +#define vmlsl_high_lane_s32(__p0_583, __p1_583, __p2_583, __p3_583) __extension__ ({ \ + int64x2_t __s0_583 = __p0_583; \ + int32x4_t __s1_583 = __p1_583; \ + int32x2_t __s2_583 = __p2_583; \ + int64x2_t __ret_583; \ + __ret_583 = __s0_583 - vmull_s32(vget_high_s32(__s1_583), splat_lane_s32(__s2_583, __p3_583)); \ + __ret_583; \ }) #else -#define vmlsl_high_lane_s32(__p0_492, __p1_492, __p2_492, __p3_492) 
__extension__ ({ \ - int64x2_t __s0_492 = __p0_492; \ - int32x4_t __s1_492 = __p1_492; \ - int32x2_t __s2_492 = __p2_492; \ - int64x2_t __rev0_492; __rev0_492 = __builtin_shufflevector(__s0_492, __s0_492, 1, 0); \ - int32x4_t __rev1_492; __rev1_492 = __builtin_shufflevector(__s1_492, __s1_492, 3, 2, 1, 0); \ - int32x2_t __rev2_492; __rev2_492 = __builtin_shufflevector(__s2_492, __s2_492, 1, 0); \ - int64x2_t __ret_492; \ - __ret_492 = __rev0_492 - __noswap_vmull_s32(__noswap_vget_high_s32(__rev1_492), __noswap_splat_lane_s32(__rev2_492, __p3_492)); \ - __ret_492 = __builtin_shufflevector(__ret_492, __ret_492, 1, 0); \ - __ret_492; \ +#define vmlsl_high_lane_s32(__p0_584, __p1_584, __p2_584, __p3_584) __extension__ ({ \ + int64x2_t __s0_584 = __p0_584; \ + int32x4_t __s1_584 = __p1_584; \ + int32x2_t __s2_584 = __p2_584; \ + int64x2_t __rev0_584; __rev0_584 = __builtin_shufflevector(__s0_584, __s0_584, 1, 0); \ + int32x4_t __rev1_584; __rev1_584 = __builtin_shufflevector(__s1_584, __s1_584, 3, 2, 1, 0); \ + int32x2_t __rev2_584; __rev2_584 = __builtin_shufflevector(__s2_584, __s2_584, 1, 0); \ + int64x2_t __ret_584; \ + __ret_584 = __rev0_584 - __noswap_vmull_s32(__noswap_vget_high_s32(__rev1_584), __noswap_splat_lane_s32(__rev2_584, __p3_584)); \ + __ret_584 = __builtin_shufflevector(__ret_584, __ret_584, 1, 0); \ + __ret_584; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlsl_high_lane_s16(__p0_493, __p1_493, __p2_493, __p3_493) __extension__ ({ \ - int32x4_t __s0_493 = __p0_493; \ - int16x8_t __s1_493 = __p1_493; \ - int16x4_t __s2_493 = __p2_493; \ - int32x4_t __ret_493; \ - __ret_493 = __s0_493 - vmull_s16(vget_high_s16(__s1_493), splat_lane_s16(__s2_493, __p3_493)); \ - __ret_493; \ +#define vmlsl_high_lane_s16(__p0_585, __p1_585, __p2_585, __p3_585) __extension__ ({ \ + int32x4_t __s0_585 = __p0_585; \ + int16x8_t __s1_585 = __p1_585; \ + int16x4_t __s2_585 = __p2_585; \ + int32x4_t __ret_585; \ + __ret_585 = __s0_585 - vmull_s16(vget_high_s16(__s1_585), splat_lane_s16(__s2_585, __p3_585)); \ + __ret_585; \ }) #else -#define vmlsl_high_lane_s16(__p0_494, __p1_494, __p2_494, __p3_494) __extension__ ({ \ - int32x4_t __s0_494 = __p0_494; \ - int16x8_t __s1_494 = __p1_494; \ - int16x4_t __s2_494 = __p2_494; \ - int32x4_t __rev0_494; __rev0_494 = __builtin_shufflevector(__s0_494, __s0_494, 3, 2, 1, 0); \ - int16x8_t __rev1_494; __rev1_494 = __builtin_shufflevector(__s1_494, __s1_494, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x4_t __rev2_494; __rev2_494 = __builtin_shufflevector(__s2_494, __s2_494, 3, 2, 1, 0); \ - int32x4_t __ret_494; \ - __ret_494 = __rev0_494 - __noswap_vmull_s16(__noswap_vget_high_s16(__rev1_494), __noswap_splat_lane_s16(__rev2_494, __p3_494)); \ - __ret_494 = __builtin_shufflevector(__ret_494, __ret_494, 3, 2, 1, 0); \ - __ret_494; \ +#define vmlsl_high_lane_s16(__p0_586, __p1_586, __p2_586, __p3_586) __extension__ ({ \ + int32x4_t __s0_586 = __p0_586; \ + int16x8_t __s1_586 = __p1_586; \ + int16x4_t __s2_586 = __p2_586; \ + int32x4_t __rev0_586; __rev0_586 = __builtin_shufflevector(__s0_586, __s0_586, 3, 2, 1, 0); \ + int16x8_t __rev1_586; __rev1_586 = __builtin_shufflevector(__s1_586, __s1_586, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x4_t __rev2_586; __rev2_586 = __builtin_shufflevector(__s2_586, __s2_586, 3, 2, 1, 0); \ + int32x4_t __ret_586; \ + __ret_586 = __rev0_586 - __noswap_vmull_s16(__noswap_vget_high_s16(__rev1_586), __noswap_splat_lane_s16(__rev2_586, __p3_586)); \ + __ret_586 = __builtin_shufflevector(__ret_586, __ret_586, 3, 2, 1, 0); \ + __ret_586; \ }) #endif 
#ifdef __LITTLE_ENDIAN__ -#define vmlsl_high_laneq_u32(__p0_495, __p1_495, __p2_495, __p3_495) __extension__ ({ \ - uint64x2_t __s0_495 = __p0_495; \ - uint32x4_t __s1_495 = __p1_495; \ - uint32x4_t __s2_495 = __p2_495; \ - uint64x2_t __ret_495; \ - __ret_495 = __s0_495 - vmull_u32(vget_high_u32(__s1_495), splat_laneq_u32(__s2_495, __p3_495)); \ - __ret_495; \ +#define vmlsl_high_laneq_u32(__p0_587, __p1_587, __p2_587, __p3_587) __extension__ ({ \ + uint64x2_t __s0_587 = __p0_587; \ + uint32x4_t __s1_587 = __p1_587; \ + uint32x4_t __s2_587 = __p2_587; \ + uint64x2_t __ret_587; \ + __ret_587 = __s0_587 - vmull_u32(vget_high_u32(__s1_587), splat_laneq_u32(__s2_587, __p3_587)); \ + __ret_587; \ }) #else -#define vmlsl_high_laneq_u32(__p0_496, __p1_496, __p2_496, __p3_496) __extension__ ({ \ - uint64x2_t __s0_496 = __p0_496; \ - uint32x4_t __s1_496 = __p1_496; \ - uint32x4_t __s2_496 = __p2_496; \ - uint64x2_t __rev0_496; __rev0_496 = __builtin_shufflevector(__s0_496, __s0_496, 1, 0); \ - uint32x4_t __rev1_496; __rev1_496 = __builtin_shufflevector(__s1_496, __s1_496, 3, 2, 1, 0); \ - uint32x4_t __rev2_496; __rev2_496 = __builtin_shufflevector(__s2_496, __s2_496, 3, 2, 1, 0); \ - uint64x2_t __ret_496; \ - __ret_496 = __rev0_496 - __noswap_vmull_u32(__noswap_vget_high_u32(__rev1_496), __noswap_splat_laneq_u32(__rev2_496, __p3_496)); \ - __ret_496 = __builtin_shufflevector(__ret_496, __ret_496, 1, 0); \ - __ret_496; \ +#define vmlsl_high_laneq_u32(__p0_588, __p1_588, __p2_588, __p3_588) __extension__ ({ \ + uint64x2_t __s0_588 = __p0_588; \ + uint32x4_t __s1_588 = __p1_588; \ + uint32x4_t __s2_588 = __p2_588; \ + uint64x2_t __rev0_588; __rev0_588 = __builtin_shufflevector(__s0_588, __s0_588, 1, 0); \ + uint32x4_t __rev1_588; __rev1_588 = __builtin_shufflevector(__s1_588, __s1_588, 3, 2, 1, 0); \ + uint32x4_t __rev2_588; __rev2_588 = __builtin_shufflevector(__s2_588, __s2_588, 3, 2, 1, 0); \ + uint64x2_t __ret_588; \ + __ret_588 = __rev0_588 - __noswap_vmull_u32(__noswap_vget_high_u32(__rev1_588), __noswap_splat_laneq_u32(__rev2_588, __p3_588)); \ + __ret_588 = __builtin_shufflevector(__ret_588, __ret_588, 1, 0); \ + __ret_588; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlsl_high_laneq_u16(__p0_497, __p1_497, __p2_497, __p3_497) __extension__ ({ \ - uint32x4_t __s0_497 = __p0_497; \ - uint16x8_t __s1_497 = __p1_497; \ - uint16x8_t __s2_497 = __p2_497; \ - uint32x4_t __ret_497; \ - __ret_497 = __s0_497 - vmull_u16(vget_high_u16(__s1_497), splat_laneq_u16(__s2_497, __p3_497)); \ - __ret_497; \ +#define vmlsl_high_laneq_u16(__p0_589, __p1_589, __p2_589, __p3_589) __extension__ ({ \ + uint32x4_t __s0_589 = __p0_589; \ + uint16x8_t __s1_589 = __p1_589; \ + uint16x8_t __s2_589 = __p2_589; \ + uint32x4_t __ret_589; \ + __ret_589 = __s0_589 - vmull_u16(vget_high_u16(__s1_589), splat_laneq_u16(__s2_589, __p3_589)); \ + __ret_589; \ }) #else -#define vmlsl_high_laneq_u16(__p0_498, __p1_498, __p2_498, __p3_498) __extension__ ({ \ - uint32x4_t __s0_498 = __p0_498; \ - uint16x8_t __s1_498 = __p1_498; \ - uint16x8_t __s2_498 = __p2_498; \ - uint32x4_t __rev0_498; __rev0_498 = __builtin_shufflevector(__s0_498, __s0_498, 3, 2, 1, 0); \ - uint16x8_t __rev1_498; __rev1_498 = __builtin_shufflevector(__s1_498, __s1_498, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __rev2_498; __rev2_498 = __builtin_shufflevector(__s2_498, __s2_498, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint32x4_t __ret_498; \ - __ret_498 = __rev0_498 - __noswap_vmull_u16(__noswap_vget_high_u16(__rev1_498), __noswap_splat_laneq_u16(__rev2_498, __p3_498)); \ - 
__ret_498 = __builtin_shufflevector(__ret_498, __ret_498, 3, 2, 1, 0); \ - __ret_498; \ +#define vmlsl_high_laneq_u16(__p0_590, __p1_590, __p2_590, __p3_590) __extension__ ({ \ + uint32x4_t __s0_590 = __p0_590; \ + uint16x8_t __s1_590 = __p1_590; \ + uint16x8_t __s2_590 = __p2_590; \ + uint32x4_t __rev0_590; __rev0_590 = __builtin_shufflevector(__s0_590, __s0_590, 3, 2, 1, 0); \ + uint16x8_t __rev1_590; __rev1_590 = __builtin_shufflevector(__s1_590, __s1_590, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __rev2_590; __rev2_590 = __builtin_shufflevector(__s2_590, __s2_590, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint32x4_t __ret_590; \ + __ret_590 = __rev0_590 - __noswap_vmull_u16(__noswap_vget_high_u16(__rev1_590), __noswap_splat_laneq_u16(__rev2_590, __p3_590)); \ + __ret_590 = __builtin_shufflevector(__ret_590, __ret_590, 3, 2, 1, 0); \ + __ret_590; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlsl_high_laneq_s32(__p0_499, __p1_499, __p2_499, __p3_499) __extension__ ({ \ - int64x2_t __s0_499 = __p0_499; \ - int32x4_t __s1_499 = __p1_499; \ - int32x4_t __s2_499 = __p2_499; \ - int64x2_t __ret_499; \ - __ret_499 = __s0_499 - vmull_s32(vget_high_s32(__s1_499), splat_laneq_s32(__s2_499, __p3_499)); \ - __ret_499; \ +#define vmlsl_high_laneq_s32(__p0_591, __p1_591, __p2_591, __p3_591) __extension__ ({ \ + int64x2_t __s0_591 = __p0_591; \ + int32x4_t __s1_591 = __p1_591; \ + int32x4_t __s2_591 = __p2_591; \ + int64x2_t __ret_591; \ + __ret_591 = __s0_591 - vmull_s32(vget_high_s32(__s1_591), splat_laneq_s32(__s2_591, __p3_591)); \ + __ret_591; \ }) #else -#define vmlsl_high_laneq_s32(__p0_500, __p1_500, __p2_500, __p3_500) __extension__ ({ \ - int64x2_t __s0_500 = __p0_500; \ - int32x4_t __s1_500 = __p1_500; \ - int32x4_t __s2_500 = __p2_500; \ - int64x2_t __rev0_500; __rev0_500 = __builtin_shufflevector(__s0_500, __s0_500, 1, 0); \ - int32x4_t __rev1_500; __rev1_500 = __builtin_shufflevector(__s1_500, __s1_500, 3, 2, 1, 0); \ - int32x4_t __rev2_500; __rev2_500 = __builtin_shufflevector(__s2_500, __s2_500, 3, 2, 1, 0); \ - int64x2_t __ret_500; \ - __ret_500 = __rev0_500 - __noswap_vmull_s32(__noswap_vget_high_s32(__rev1_500), __noswap_splat_laneq_s32(__rev2_500, __p3_500)); \ - __ret_500 = __builtin_shufflevector(__ret_500, __ret_500, 1, 0); \ - __ret_500; \ +#define vmlsl_high_laneq_s32(__p0_592, __p1_592, __p2_592, __p3_592) __extension__ ({ \ + int64x2_t __s0_592 = __p0_592; \ + int32x4_t __s1_592 = __p1_592; \ + int32x4_t __s2_592 = __p2_592; \ + int64x2_t __rev0_592; __rev0_592 = __builtin_shufflevector(__s0_592, __s0_592, 1, 0); \ + int32x4_t __rev1_592; __rev1_592 = __builtin_shufflevector(__s1_592, __s1_592, 3, 2, 1, 0); \ + int32x4_t __rev2_592; __rev2_592 = __builtin_shufflevector(__s2_592, __s2_592, 3, 2, 1, 0); \ + int64x2_t __ret_592; \ + __ret_592 = __rev0_592 - __noswap_vmull_s32(__noswap_vget_high_s32(__rev1_592), __noswap_splat_laneq_s32(__rev2_592, __p3_592)); \ + __ret_592 = __builtin_shufflevector(__ret_592, __ret_592, 1, 0); \ + __ret_592; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlsl_high_laneq_s16(__p0_501, __p1_501, __p2_501, __p3_501) __extension__ ({ \ - int32x4_t __s0_501 = __p0_501; \ - int16x8_t __s1_501 = __p1_501; \ - int16x8_t __s2_501 = __p2_501; \ - int32x4_t __ret_501; \ - __ret_501 = __s0_501 - vmull_s16(vget_high_s16(__s1_501), splat_laneq_s16(__s2_501, __p3_501)); \ - __ret_501; \ +#define vmlsl_high_laneq_s16(__p0_593, __p1_593, __p2_593, __p3_593) __extension__ ({ \ + int32x4_t __s0_593 = __p0_593; \ + int16x8_t __s1_593 = __p1_593; \ + int16x8_t __s2_593 = 
__p2_593; \ + int32x4_t __ret_593; \ + __ret_593 = __s0_593 - vmull_s16(vget_high_s16(__s1_593), splat_laneq_s16(__s2_593, __p3_593)); \ + __ret_593; \ }) #else -#define vmlsl_high_laneq_s16(__p0_502, __p1_502, __p2_502, __p3_502) __extension__ ({ \ - int32x4_t __s0_502 = __p0_502; \ - int16x8_t __s1_502 = __p1_502; \ - int16x8_t __s2_502 = __p2_502; \ - int32x4_t __rev0_502; __rev0_502 = __builtin_shufflevector(__s0_502, __s0_502, 3, 2, 1, 0); \ - int16x8_t __rev1_502; __rev1_502 = __builtin_shufflevector(__s1_502, __s1_502, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev2_502; __rev2_502 = __builtin_shufflevector(__s2_502, __s2_502, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32x4_t __ret_502; \ - __ret_502 = __rev0_502 - __noswap_vmull_s16(__noswap_vget_high_s16(__rev1_502), __noswap_splat_laneq_s16(__rev2_502, __p3_502)); \ - __ret_502 = __builtin_shufflevector(__ret_502, __ret_502, 3, 2, 1, 0); \ - __ret_502; \ +#define vmlsl_high_laneq_s16(__p0_594, __p1_594, __p2_594, __p3_594) __extension__ ({ \ + int32x4_t __s0_594 = __p0_594; \ + int16x8_t __s1_594 = __p1_594; \ + int16x8_t __s2_594 = __p2_594; \ + int32x4_t __rev0_594; __rev0_594 = __builtin_shufflevector(__s0_594, __s0_594, 3, 2, 1, 0); \ + int16x8_t __rev1_594; __rev1_594 = __builtin_shufflevector(__s1_594, __s1_594, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev2_594; __rev2_594 = __builtin_shufflevector(__s2_594, __s2_594, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x4_t __ret_594; \ + __ret_594 = __rev0_594 - __noswap_vmull_s16(__noswap_vget_high_s16(__rev1_594), __noswap_splat_laneq_s16(__rev2_594, __p3_594)); \ + __ret_594 = __builtin_shufflevector(__ret_594, __ret_594, 3, 2, 1, 0); \ + __ret_594; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlsl_laneq_u32(__p0_503, __p1_503, __p2_503, __p3_503) __extension__ ({ \ - uint64x2_t __s0_503 = __p0_503; \ - uint32x2_t __s1_503 = __p1_503; \ - uint32x4_t __s2_503 = __p2_503; \ - uint64x2_t __ret_503; \ - __ret_503 = __s0_503 - vmull_u32(__s1_503, splat_laneq_u32(__s2_503, __p3_503)); \ - __ret_503; \ +#define vmlsl_laneq_u32(__p0_595, __p1_595, __p2_595, __p3_595) __extension__ ({ \ + uint64x2_t __s0_595 = __p0_595; \ + uint32x2_t __s1_595 = __p1_595; \ + uint32x4_t __s2_595 = __p2_595; \ + uint64x2_t __ret_595; \ + __ret_595 = __s0_595 - vmull_u32(__s1_595, splat_laneq_u32(__s2_595, __p3_595)); \ + __ret_595; \ }) #else -#define vmlsl_laneq_u32(__p0_504, __p1_504, __p2_504, __p3_504) __extension__ ({ \ - uint64x2_t __s0_504 = __p0_504; \ - uint32x2_t __s1_504 = __p1_504; \ - uint32x4_t __s2_504 = __p2_504; \ - uint64x2_t __rev0_504; __rev0_504 = __builtin_shufflevector(__s0_504, __s0_504, 1, 0); \ - uint32x2_t __rev1_504; __rev1_504 = __builtin_shufflevector(__s1_504, __s1_504, 1, 0); \ - uint32x4_t __rev2_504; __rev2_504 = __builtin_shufflevector(__s2_504, __s2_504, 3, 2, 1, 0); \ - uint64x2_t __ret_504; \ - __ret_504 = __rev0_504 - __noswap_vmull_u32(__rev1_504, __noswap_splat_laneq_u32(__rev2_504, __p3_504)); \ - __ret_504 = __builtin_shufflevector(__ret_504, __ret_504, 1, 0); \ - __ret_504; \ +#define vmlsl_laneq_u32(__p0_596, __p1_596, __p2_596, __p3_596) __extension__ ({ \ + uint64x2_t __s0_596 = __p0_596; \ + uint32x2_t __s1_596 = __p1_596; \ + uint32x4_t __s2_596 = __p2_596; \ + uint64x2_t __rev0_596; __rev0_596 = __builtin_shufflevector(__s0_596, __s0_596, 1, 0); \ + uint32x2_t __rev1_596; __rev1_596 = __builtin_shufflevector(__s1_596, __s1_596, 1, 0); \ + uint32x4_t __rev2_596; __rev2_596 = __builtin_shufflevector(__s2_596, __s2_596, 3, 2, 1, 0); \ + uint64x2_t __ret_596; \ + __ret_596 = 
__rev0_596 - __noswap_vmull_u32(__rev1_596, __noswap_splat_laneq_u32(__rev2_596, __p3_596)); \ + __ret_596 = __builtin_shufflevector(__ret_596, __ret_596, 1, 0); \ + __ret_596; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlsl_laneq_u16(__p0_505, __p1_505, __p2_505, __p3_505) __extension__ ({ \ - uint32x4_t __s0_505 = __p0_505; \ - uint16x4_t __s1_505 = __p1_505; \ - uint16x8_t __s2_505 = __p2_505; \ - uint32x4_t __ret_505; \ - __ret_505 = __s0_505 - vmull_u16(__s1_505, splat_laneq_u16(__s2_505, __p3_505)); \ - __ret_505; \ +#define vmlsl_laneq_u16(__p0_597, __p1_597, __p2_597, __p3_597) __extension__ ({ \ + uint32x4_t __s0_597 = __p0_597; \ + uint16x4_t __s1_597 = __p1_597; \ + uint16x8_t __s2_597 = __p2_597; \ + uint32x4_t __ret_597; \ + __ret_597 = __s0_597 - vmull_u16(__s1_597, splat_laneq_u16(__s2_597, __p3_597)); \ + __ret_597; \ }) #else -#define vmlsl_laneq_u16(__p0_506, __p1_506, __p2_506, __p3_506) __extension__ ({ \ - uint32x4_t __s0_506 = __p0_506; \ - uint16x4_t __s1_506 = __p1_506; \ - uint16x8_t __s2_506 = __p2_506; \ - uint32x4_t __rev0_506; __rev0_506 = __builtin_shufflevector(__s0_506, __s0_506, 3, 2, 1, 0); \ - uint16x4_t __rev1_506; __rev1_506 = __builtin_shufflevector(__s1_506, __s1_506, 3, 2, 1, 0); \ - uint16x8_t __rev2_506; __rev2_506 = __builtin_shufflevector(__s2_506, __s2_506, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint32x4_t __ret_506; \ - __ret_506 = __rev0_506 - __noswap_vmull_u16(__rev1_506, __noswap_splat_laneq_u16(__rev2_506, __p3_506)); \ - __ret_506 = __builtin_shufflevector(__ret_506, __ret_506, 3, 2, 1, 0); \ - __ret_506; \ +#define vmlsl_laneq_u16(__p0_598, __p1_598, __p2_598, __p3_598) __extension__ ({ \ + uint32x4_t __s0_598 = __p0_598; \ + uint16x4_t __s1_598 = __p1_598; \ + uint16x8_t __s2_598 = __p2_598; \ + uint32x4_t __rev0_598; __rev0_598 = __builtin_shufflevector(__s0_598, __s0_598, 3, 2, 1, 0); \ + uint16x4_t __rev1_598; __rev1_598 = __builtin_shufflevector(__s1_598, __s1_598, 3, 2, 1, 0); \ + uint16x8_t __rev2_598; __rev2_598 = __builtin_shufflevector(__s2_598, __s2_598, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint32x4_t __ret_598; \ + __ret_598 = __rev0_598 - __noswap_vmull_u16(__rev1_598, __noswap_splat_laneq_u16(__rev2_598, __p3_598)); \ + __ret_598 = __builtin_shufflevector(__ret_598, __ret_598, 3, 2, 1, 0); \ + __ret_598; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlsl_laneq_s32(__p0_507, __p1_507, __p2_507, __p3_507) __extension__ ({ \ - int64x2_t __s0_507 = __p0_507; \ - int32x2_t __s1_507 = __p1_507; \ - int32x4_t __s2_507 = __p2_507; \ - int64x2_t __ret_507; \ - __ret_507 = __s0_507 - vmull_s32(__s1_507, splat_laneq_s32(__s2_507, __p3_507)); \ - __ret_507; \ +#define vmlsl_laneq_s32(__p0_599, __p1_599, __p2_599, __p3_599) __extension__ ({ \ + int64x2_t __s0_599 = __p0_599; \ + int32x2_t __s1_599 = __p1_599; \ + int32x4_t __s2_599 = __p2_599; \ + int64x2_t __ret_599; \ + __ret_599 = __s0_599 - vmull_s32(__s1_599, splat_laneq_s32(__s2_599, __p3_599)); \ + __ret_599; \ }) #else -#define vmlsl_laneq_s32(__p0_508, __p1_508, __p2_508, __p3_508) __extension__ ({ \ - int64x2_t __s0_508 = __p0_508; \ - int32x2_t __s1_508 = __p1_508; \ - int32x4_t __s2_508 = __p2_508; \ - int64x2_t __rev0_508; __rev0_508 = __builtin_shufflevector(__s0_508, __s0_508, 1, 0); \ - int32x2_t __rev1_508; __rev1_508 = __builtin_shufflevector(__s1_508, __s1_508, 1, 0); \ - int32x4_t __rev2_508; __rev2_508 = __builtin_shufflevector(__s2_508, __s2_508, 3, 2, 1, 0); \ - int64x2_t __ret_508; \ - __ret_508 = __rev0_508 - __noswap_vmull_s32(__rev1_508, 
__noswap_splat_laneq_s32(__rev2_508, __p3_508)); \ - __ret_508 = __builtin_shufflevector(__ret_508, __ret_508, 1, 0); \ - __ret_508; \ +#define vmlsl_laneq_s32(__p0_600, __p1_600, __p2_600, __p3_600) __extension__ ({ \ + int64x2_t __s0_600 = __p0_600; \ + int32x2_t __s1_600 = __p1_600; \ + int32x4_t __s2_600 = __p2_600; \ + int64x2_t __rev0_600; __rev0_600 = __builtin_shufflevector(__s0_600, __s0_600, 1, 0); \ + int32x2_t __rev1_600; __rev1_600 = __builtin_shufflevector(__s1_600, __s1_600, 1, 0); \ + int32x4_t __rev2_600; __rev2_600 = __builtin_shufflevector(__s2_600, __s2_600, 3, 2, 1, 0); \ + int64x2_t __ret_600; \ + __ret_600 = __rev0_600 - __noswap_vmull_s32(__rev1_600, __noswap_splat_laneq_s32(__rev2_600, __p3_600)); \ + __ret_600 = __builtin_shufflevector(__ret_600, __ret_600, 1, 0); \ + __ret_600; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlsl_laneq_s16(__p0_509, __p1_509, __p2_509, __p3_509) __extension__ ({ \ - int32x4_t __s0_509 = __p0_509; \ - int16x4_t __s1_509 = __p1_509; \ - int16x8_t __s2_509 = __p2_509; \ - int32x4_t __ret_509; \ - __ret_509 = __s0_509 - vmull_s16(__s1_509, splat_laneq_s16(__s2_509, __p3_509)); \ - __ret_509; \ +#define vmlsl_laneq_s16(__p0_601, __p1_601, __p2_601, __p3_601) __extension__ ({ \ + int32x4_t __s0_601 = __p0_601; \ + int16x4_t __s1_601 = __p1_601; \ + int16x8_t __s2_601 = __p2_601; \ + int32x4_t __ret_601; \ + __ret_601 = __s0_601 - vmull_s16(__s1_601, splat_laneq_s16(__s2_601, __p3_601)); \ + __ret_601; \ }) #else -#define vmlsl_laneq_s16(__p0_510, __p1_510, __p2_510, __p3_510) __extension__ ({ \ - int32x4_t __s0_510 = __p0_510; \ - int16x4_t __s1_510 = __p1_510; \ - int16x8_t __s2_510 = __p2_510; \ - int32x4_t __rev0_510; __rev0_510 = __builtin_shufflevector(__s0_510, __s0_510, 3, 2, 1, 0); \ - int16x4_t __rev1_510; __rev1_510 = __builtin_shufflevector(__s1_510, __s1_510, 3, 2, 1, 0); \ - int16x8_t __rev2_510; __rev2_510 = __builtin_shufflevector(__s2_510, __s2_510, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32x4_t __ret_510; \ - __ret_510 = __rev0_510 - __noswap_vmull_s16(__rev1_510, __noswap_splat_laneq_s16(__rev2_510, __p3_510)); \ - __ret_510 = __builtin_shufflevector(__ret_510, __ret_510, 3, 2, 1, 0); \ - __ret_510; \ +#define vmlsl_laneq_s16(__p0_602, __p1_602, __p2_602, __p3_602) __extension__ ({ \ + int32x4_t __s0_602 = __p0_602; \ + int16x4_t __s1_602 = __p1_602; \ + int16x8_t __s2_602 = __p2_602; \ + int32x4_t __rev0_602; __rev0_602 = __builtin_shufflevector(__s0_602, __s0_602, 3, 2, 1, 0); \ + int16x4_t __rev1_602; __rev1_602 = __builtin_shufflevector(__s1_602, __s1_602, 3, 2, 1, 0); \ + int16x8_t __rev2_602; __rev2_602 = __builtin_shufflevector(__s2_602, __s2_602, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x4_t __ret_602; \ + __ret_602 = __rev0_602 - __noswap_vmull_s16(__rev1_602, __noswap_splat_laneq_s16(__rev2_602, __p3_602)); \ + __ret_602 = __builtin_shufflevector(__ret_602, __ret_602, 3, 2, 1, 0); \ + __ret_602; \ }) #endif @@ -53701,146 +55371,146 @@ __ai float64x1_t vmov_n_f64(float64_t __p0) { return __ret; } #ifdef __LITTLE_ENDIAN__ -__ai uint16x8_t vmovl_high_u8(uint8x16_t __p0_511) { - uint16x8_t __ret_511; - uint8x8_t __a1_511 = vget_high_u8(__p0_511); - __ret_511 = (uint16x8_t)(vshll_n_u8(__a1_511, 0)); - return __ret_511; +__ai uint16x8_t vmovl_high_u8(uint8x16_t __p0_603) { + uint16x8_t __ret_603; + uint8x8_t __a1_603 = vget_high_u8(__p0_603); + __ret_603 = (uint16x8_t)(vshll_n_u8(__a1_603, 0)); + return __ret_603; } #else -__ai uint16x8_t vmovl_high_u8(uint8x16_t __p0_512) { - uint8x16_t __rev0_512; __rev0_512 = 
__builtin_shufflevector(__p0_512, __p0_512, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - uint16x8_t __ret_512; - uint8x8_t __a1_512 = __noswap_vget_high_u8(__rev0_512); - __ret_512 = (uint16x8_t)(__noswap_vshll_n_u8(__a1_512, 0)); - __ret_512 = __builtin_shufflevector(__ret_512, __ret_512, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret_512; +__ai uint16x8_t vmovl_high_u8(uint8x16_t __p0_604) { + uint8x16_t __rev0_604; __rev0_604 = __builtin_shufflevector(__p0_604, __p0_604, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret_604; + uint8x8_t __a1_604 = __noswap_vget_high_u8(__rev0_604); + __ret_604 = (uint16x8_t)(__noswap_vshll_n_u8(__a1_604, 0)); + __ret_604 = __builtin_shufflevector(__ret_604, __ret_604, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret_604; } -__ai uint16x8_t __noswap_vmovl_high_u8(uint8x16_t __p0_513) { - uint16x8_t __ret_513; - uint8x8_t __a1_513 = __noswap_vget_high_u8(__p0_513); - __ret_513 = (uint16x8_t)(__noswap_vshll_n_u8(__a1_513, 0)); - return __ret_513; +__ai uint16x8_t __noswap_vmovl_high_u8(uint8x16_t __p0_605) { + uint16x8_t __ret_605; + uint8x8_t __a1_605 = __noswap_vget_high_u8(__p0_605); + __ret_605 = (uint16x8_t)(__noswap_vshll_n_u8(__a1_605, 0)); + return __ret_605; } #endif #ifdef __LITTLE_ENDIAN__ -__ai uint64x2_t vmovl_high_u32(uint32x4_t __p0_514) { - uint64x2_t __ret_514; - uint32x2_t __a1_514 = vget_high_u32(__p0_514); - __ret_514 = (uint64x2_t)(vshll_n_u32(__a1_514, 0)); - return __ret_514; +__ai uint64x2_t vmovl_high_u32(uint32x4_t __p0_606) { + uint64x2_t __ret_606; + uint32x2_t __a1_606 = vget_high_u32(__p0_606); + __ret_606 = (uint64x2_t)(vshll_n_u32(__a1_606, 0)); + return __ret_606; } #else -__ai uint64x2_t vmovl_high_u32(uint32x4_t __p0_515) { - uint32x4_t __rev0_515; __rev0_515 = __builtin_shufflevector(__p0_515, __p0_515, 3, 2, 1, 0); - uint64x2_t __ret_515; - uint32x2_t __a1_515 = __noswap_vget_high_u32(__rev0_515); - __ret_515 = (uint64x2_t)(__noswap_vshll_n_u32(__a1_515, 0)); - __ret_515 = __builtin_shufflevector(__ret_515, __ret_515, 1, 0); - return __ret_515; +__ai uint64x2_t vmovl_high_u32(uint32x4_t __p0_607) { + uint32x4_t __rev0_607; __rev0_607 = __builtin_shufflevector(__p0_607, __p0_607, 3, 2, 1, 0); + uint64x2_t __ret_607; + uint32x2_t __a1_607 = __noswap_vget_high_u32(__rev0_607); + __ret_607 = (uint64x2_t)(__noswap_vshll_n_u32(__a1_607, 0)); + __ret_607 = __builtin_shufflevector(__ret_607, __ret_607, 1, 0); + return __ret_607; } -__ai uint64x2_t __noswap_vmovl_high_u32(uint32x4_t __p0_516) { - uint64x2_t __ret_516; - uint32x2_t __a1_516 = __noswap_vget_high_u32(__p0_516); - __ret_516 = (uint64x2_t)(__noswap_vshll_n_u32(__a1_516, 0)); - return __ret_516; +__ai uint64x2_t __noswap_vmovl_high_u32(uint32x4_t __p0_608) { + uint64x2_t __ret_608; + uint32x2_t __a1_608 = __noswap_vget_high_u32(__p0_608); + __ret_608 = (uint64x2_t)(__noswap_vshll_n_u32(__a1_608, 0)); + return __ret_608; } #endif #ifdef __LITTLE_ENDIAN__ -__ai uint32x4_t vmovl_high_u16(uint16x8_t __p0_517) { - uint32x4_t __ret_517; - uint16x4_t __a1_517 = vget_high_u16(__p0_517); - __ret_517 = (uint32x4_t)(vshll_n_u16(__a1_517, 0)); - return __ret_517; +__ai uint32x4_t vmovl_high_u16(uint16x8_t __p0_609) { + uint32x4_t __ret_609; + uint16x4_t __a1_609 = vget_high_u16(__p0_609); + __ret_609 = (uint32x4_t)(vshll_n_u16(__a1_609, 0)); + return __ret_609; } #else -__ai uint32x4_t vmovl_high_u16(uint16x8_t __p0_518) { - uint16x8_t __rev0_518; __rev0_518 = __builtin_shufflevector(__p0_518, __p0_518, 7, 6, 5, 4, 3, 2, 1, 0); - uint32x4_t __ret_518; - 
uint16x4_t __a1_518 = __noswap_vget_high_u16(__rev0_518); - __ret_518 = (uint32x4_t)(__noswap_vshll_n_u16(__a1_518, 0)); - __ret_518 = __builtin_shufflevector(__ret_518, __ret_518, 3, 2, 1, 0); - return __ret_518; +__ai uint32x4_t vmovl_high_u16(uint16x8_t __p0_610) { + uint16x8_t __rev0_610; __rev0_610 = __builtin_shufflevector(__p0_610, __p0_610, 7, 6, 5, 4, 3, 2, 1, 0); + uint32x4_t __ret_610; + uint16x4_t __a1_610 = __noswap_vget_high_u16(__rev0_610); + __ret_610 = (uint32x4_t)(__noswap_vshll_n_u16(__a1_610, 0)); + __ret_610 = __builtin_shufflevector(__ret_610, __ret_610, 3, 2, 1, 0); + return __ret_610; } -__ai uint32x4_t __noswap_vmovl_high_u16(uint16x8_t __p0_519) { - uint32x4_t __ret_519; - uint16x4_t __a1_519 = __noswap_vget_high_u16(__p0_519); - __ret_519 = (uint32x4_t)(__noswap_vshll_n_u16(__a1_519, 0)); - return __ret_519; +__ai uint32x4_t __noswap_vmovl_high_u16(uint16x8_t __p0_611) { + uint32x4_t __ret_611; + uint16x4_t __a1_611 = __noswap_vget_high_u16(__p0_611); + __ret_611 = (uint32x4_t)(__noswap_vshll_n_u16(__a1_611, 0)); + return __ret_611; } #endif #ifdef __LITTLE_ENDIAN__ -__ai int16x8_t vmovl_high_s8(int8x16_t __p0_520) { - int16x8_t __ret_520; - int8x8_t __a1_520 = vget_high_s8(__p0_520); - __ret_520 = (int16x8_t)(vshll_n_s8(__a1_520, 0)); - return __ret_520; +__ai int16x8_t vmovl_high_s8(int8x16_t __p0_612) { + int16x8_t __ret_612; + int8x8_t __a1_612 = vget_high_s8(__p0_612); + __ret_612 = (int16x8_t)(vshll_n_s8(__a1_612, 0)); + return __ret_612; } #else -__ai int16x8_t vmovl_high_s8(int8x16_t __p0_521) { - int8x16_t __rev0_521; __rev0_521 = __builtin_shufflevector(__p0_521, __p0_521, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int16x8_t __ret_521; - int8x8_t __a1_521 = __noswap_vget_high_s8(__rev0_521); - __ret_521 = (int16x8_t)(__noswap_vshll_n_s8(__a1_521, 0)); - __ret_521 = __builtin_shufflevector(__ret_521, __ret_521, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret_521; +__ai int16x8_t vmovl_high_s8(int8x16_t __p0_613) { + int8x16_t __rev0_613; __rev0_613 = __builtin_shufflevector(__p0_613, __p0_613, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + int16x8_t __ret_613; + int8x8_t __a1_613 = __noswap_vget_high_s8(__rev0_613); + __ret_613 = (int16x8_t)(__noswap_vshll_n_s8(__a1_613, 0)); + __ret_613 = __builtin_shufflevector(__ret_613, __ret_613, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret_613; } -__ai int16x8_t __noswap_vmovl_high_s8(int8x16_t __p0_522) { - int16x8_t __ret_522; - int8x8_t __a1_522 = __noswap_vget_high_s8(__p0_522); - __ret_522 = (int16x8_t)(__noswap_vshll_n_s8(__a1_522, 0)); - return __ret_522; +__ai int16x8_t __noswap_vmovl_high_s8(int8x16_t __p0_614) { + int16x8_t __ret_614; + int8x8_t __a1_614 = __noswap_vget_high_s8(__p0_614); + __ret_614 = (int16x8_t)(__noswap_vshll_n_s8(__a1_614, 0)); + return __ret_614; } #endif #ifdef __LITTLE_ENDIAN__ -__ai int64x2_t vmovl_high_s32(int32x4_t __p0_523) { - int64x2_t __ret_523; - int32x2_t __a1_523 = vget_high_s32(__p0_523); - __ret_523 = (int64x2_t)(vshll_n_s32(__a1_523, 0)); - return __ret_523; +__ai int64x2_t vmovl_high_s32(int32x4_t __p0_615) { + int64x2_t __ret_615; + int32x2_t __a1_615 = vget_high_s32(__p0_615); + __ret_615 = (int64x2_t)(vshll_n_s32(__a1_615, 0)); + return __ret_615; } #else -__ai int64x2_t vmovl_high_s32(int32x4_t __p0_524) { - int32x4_t __rev0_524; __rev0_524 = __builtin_shufflevector(__p0_524, __p0_524, 3, 2, 1, 0); - int64x2_t __ret_524; - int32x2_t __a1_524 = __noswap_vget_high_s32(__rev0_524); - __ret_524 = (int64x2_t)(__noswap_vshll_n_s32(__a1_524, 0)); - __ret_524 = 
__builtin_shufflevector(__ret_524, __ret_524, 1, 0); - return __ret_524; +__ai int64x2_t vmovl_high_s32(int32x4_t __p0_616) { + int32x4_t __rev0_616; __rev0_616 = __builtin_shufflevector(__p0_616, __p0_616, 3, 2, 1, 0); + int64x2_t __ret_616; + int32x2_t __a1_616 = __noswap_vget_high_s32(__rev0_616); + __ret_616 = (int64x2_t)(__noswap_vshll_n_s32(__a1_616, 0)); + __ret_616 = __builtin_shufflevector(__ret_616, __ret_616, 1, 0); + return __ret_616; } -__ai int64x2_t __noswap_vmovl_high_s32(int32x4_t __p0_525) { - int64x2_t __ret_525; - int32x2_t __a1_525 = __noswap_vget_high_s32(__p0_525); - __ret_525 = (int64x2_t)(__noswap_vshll_n_s32(__a1_525, 0)); - return __ret_525; +__ai int64x2_t __noswap_vmovl_high_s32(int32x4_t __p0_617) { + int64x2_t __ret_617; + int32x2_t __a1_617 = __noswap_vget_high_s32(__p0_617); + __ret_617 = (int64x2_t)(__noswap_vshll_n_s32(__a1_617, 0)); + return __ret_617; } #endif #ifdef __LITTLE_ENDIAN__ -__ai int32x4_t vmovl_high_s16(int16x8_t __p0_526) { - int32x4_t __ret_526; - int16x4_t __a1_526 = vget_high_s16(__p0_526); - __ret_526 = (int32x4_t)(vshll_n_s16(__a1_526, 0)); - return __ret_526; +__ai int32x4_t vmovl_high_s16(int16x8_t __p0_618) { + int32x4_t __ret_618; + int16x4_t __a1_618 = vget_high_s16(__p0_618); + __ret_618 = (int32x4_t)(vshll_n_s16(__a1_618, 0)); + return __ret_618; } #else -__ai int32x4_t vmovl_high_s16(int16x8_t __p0_527) { - int16x8_t __rev0_527; __rev0_527 = __builtin_shufflevector(__p0_527, __p0_527, 7, 6, 5, 4, 3, 2, 1, 0); - int32x4_t __ret_527; - int16x4_t __a1_527 = __noswap_vget_high_s16(__rev0_527); - __ret_527 = (int32x4_t)(__noswap_vshll_n_s16(__a1_527, 0)); - __ret_527 = __builtin_shufflevector(__ret_527, __ret_527, 3, 2, 1, 0); - return __ret_527; +__ai int32x4_t vmovl_high_s16(int16x8_t __p0_619) { + int16x8_t __rev0_619; __rev0_619 = __builtin_shufflevector(__p0_619, __p0_619, 7, 6, 5, 4, 3, 2, 1, 0); + int32x4_t __ret_619; + int16x4_t __a1_619 = __noswap_vget_high_s16(__rev0_619); + __ret_619 = (int32x4_t)(__noswap_vshll_n_s16(__a1_619, 0)); + __ret_619 = __builtin_shufflevector(__ret_619, __ret_619, 3, 2, 1, 0); + return __ret_619; } -__ai int32x4_t __noswap_vmovl_high_s16(int16x8_t __p0_528) { - int32x4_t __ret_528; - int16x4_t __a1_528 = __noswap_vget_high_s16(__p0_528); - __ret_528 = (int32x4_t)(__noswap_vshll_n_s16(__a1_528, 0)); - return __ret_528; +__ai int32x4_t __noswap_vmovl_high_s16(int16x8_t __p0_620) { + int32x4_t __ret_620; + int16x4_t __a1_620 = __noswap_vget_high_s16(__p0_620); + __ret_620 = (int32x4_t)(__noswap_vshll_n_s16(__a1_620, 0)); + return __ret_620; } #endif @@ -53968,29 +55638,29 @@ __ai float64x1_t vmul_f64(float64x1_t __p0, float64x1_t __p1) { __ret = __p0 * __p1; return __ret; } -#define vmuld_lane_f64(__p0_529, __p1_529, __p2_529) __extension__ ({ \ - float64_t __s0_529 = __p0_529; \ - float64x1_t __s1_529 = __p1_529; \ - float64_t __ret_529; \ - __ret_529 = __s0_529 * vget_lane_f64(__s1_529, __p2_529); \ - __ret_529; \ +#define vmuld_lane_f64(__p0_621, __p1_621, __p2_621) __extension__ ({ \ + float64_t __s0_621 = __p0_621; \ + float64x1_t __s1_621 = __p1_621; \ + float64_t __ret_621; \ + __ret_621 = __s0_621 * vget_lane_f64(__s1_621, __p2_621); \ + __ret_621; \ }) #ifdef __LITTLE_ENDIAN__ -#define vmuls_lane_f32(__p0_530, __p1_530, __p2_530) __extension__ ({ \ - float32_t __s0_530 = __p0_530; \ - float32x2_t __s1_530 = __p1_530; \ - float32_t __ret_530; \ - __ret_530 = __s0_530 * vget_lane_f32(__s1_530, __p2_530); \ - __ret_530; \ +#define vmuls_lane_f32(__p0_622, __p1_622, __p2_622) __extension__ ({ 
\ + float32_t __s0_622 = __p0_622; \ + float32x2_t __s1_622 = __p1_622; \ + float32_t __ret_622; \ + __ret_622 = __s0_622 * vget_lane_f32(__s1_622, __p2_622); \ + __ret_622; \ }) #else -#define vmuls_lane_f32(__p0_531, __p1_531, __p2_531) __extension__ ({ \ - float32_t __s0_531 = __p0_531; \ - float32x2_t __s1_531 = __p1_531; \ - float32x2_t __rev1_531; __rev1_531 = __builtin_shufflevector(__s1_531, __s1_531, 1, 0); \ - float32_t __ret_531; \ - __ret_531 = __s0_531 * __noswap_vget_lane_f32(__rev1_531, __p2_531); \ - __ret_531; \ +#define vmuls_lane_f32(__p0_623, __p1_623, __p2_623) __extension__ ({ \ + float32_t __s0_623 = __p0_623; \ + float32x2_t __s1_623 = __p1_623; \ + float32x2_t __rev1_623; __rev1_623 = __builtin_shufflevector(__s1_623, __s1_623, 1, 0); \ + float32_t __ret_623; \ + __ret_623 = __s0_623 * __noswap_vget_lane_f32(__rev1_623, __p2_623); \ + __ret_623; \ }) #endif @@ -54002,60 +55672,60 @@ __ai float64x1_t vmul_f64(float64x1_t __p0, float64x1_t __p1) { __ret; \ }) #ifdef __LITTLE_ENDIAN__ -#define vmulq_lane_f64(__p0_532, __p1_532, __p2_532) __extension__ ({ \ - float64x2_t __s0_532 = __p0_532; \ - float64x1_t __s1_532 = __p1_532; \ - float64x2_t __ret_532; \ - __ret_532 = __s0_532 * splatq_lane_f64(__s1_532, __p2_532); \ - __ret_532; \ +#define vmulq_lane_f64(__p0_624, __p1_624, __p2_624) __extension__ ({ \ + float64x2_t __s0_624 = __p0_624; \ + float64x1_t __s1_624 = __p1_624; \ + float64x2_t __ret_624; \ + __ret_624 = __s0_624 * splatq_lane_f64(__s1_624, __p2_624); \ + __ret_624; \ }) #else -#define vmulq_lane_f64(__p0_533, __p1_533, __p2_533) __extension__ ({ \ - float64x2_t __s0_533 = __p0_533; \ - float64x1_t __s1_533 = __p1_533; \ - float64x2_t __rev0_533; __rev0_533 = __builtin_shufflevector(__s0_533, __s0_533, 1, 0); \ - float64x2_t __ret_533; \ - __ret_533 = __rev0_533 * __noswap_splatq_lane_f64(__s1_533, __p2_533); \ - __ret_533 = __builtin_shufflevector(__ret_533, __ret_533, 1, 0); \ - __ret_533; \ +#define vmulq_lane_f64(__p0_625, __p1_625, __p2_625) __extension__ ({ \ + float64x2_t __s0_625 = __p0_625; \ + float64x1_t __s1_625 = __p1_625; \ + float64x2_t __rev0_625; __rev0_625 = __builtin_shufflevector(__s0_625, __s0_625, 1, 0); \ + float64x2_t __ret_625; \ + __ret_625 = __rev0_625 * __noswap_splatq_lane_f64(__s1_625, __p2_625); \ + __ret_625 = __builtin_shufflevector(__ret_625, __ret_625, 1, 0); \ + __ret_625; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmuld_laneq_f64(__p0_534, __p1_534, __p2_534) __extension__ ({ \ - float64_t __s0_534 = __p0_534; \ - float64x2_t __s1_534 = __p1_534; \ - float64_t __ret_534; \ - __ret_534 = __s0_534 * vgetq_lane_f64(__s1_534, __p2_534); \ - __ret_534; \ +#define vmuld_laneq_f64(__p0_626, __p1_626, __p2_626) __extension__ ({ \ + float64_t __s0_626 = __p0_626; \ + float64x2_t __s1_626 = __p1_626; \ + float64_t __ret_626; \ + __ret_626 = __s0_626 * vgetq_lane_f64(__s1_626, __p2_626); \ + __ret_626; \ }) #else -#define vmuld_laneq_f64(__p0_535, __p1_535, __p2_535) __extension__ ({ \ - float64_t __s0_535 = __p0_535; \ - float64x2_t __s1_535 = __p1_535; \ - float64x2_t __rev1_535; __rev1_535 = __builtin_shufflevector(__s1_535, __s1_535, 1, 0); \ - float64_t __ret_535; \ - __ret_535 = __s0_535 * __noswap_vgetq_lane_f64(__rev1_535, __p2_535); \ - __ret_535; \ +#define vmuld_laneq_f64(__p0_627, __p1_627, __p2_627) __extension__ ({ \ + float64_t __s0_627 = __p0_627; \ + float64x2_t __s1_627 = __p1_627; \ + float64x2_t __rev1_627; __rev1_627 = __builtin_shufflevector(__s1_627, __s1_627, 1, 0); \ + float64_t __ret_627; \ + 
__ret_627 = __s0_627 * __noswap_vgetq_lane_f64(__rev1_627, __p2_627); \ + __ret_627; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmuls_laneq_f32(__p0_536, __p1_536, __p2_536) __extension__ ({ \ - float32_t __s0_536 = __p0_536; \ - float32x4_t __s1_536 = __p1_536; \ - float32_t __ret_536; \ - __ret_536 = __s0_536 * vgetq_lane_f32(__s1_536, __p2_536); \ - __ret_536; \ +#define vmuls_laneq_f32(__p0_628, __p1_628, __p2_628) __extension__ ({ \ + float32_t __s0_628 = __p0_628; \ + float32x4_t __s1_628 = __p1_628; \ + float32_t __ret_628; \ + __ret_628 = __s0_628 * vgetq_lane_f32(__s1_628, __p2_628); \ + __ret_628; \ }) #else -#define vmuls_laneq_f32(__p0_537, __p1_537, __p2_537) __extension__ ({ \ - float32_t __s0_537 = __p0_537; \ - float32x4_t __s1_537 = __p1_537; \ - float32x4_t __rev1_537; __rev1_537 = __builtin_shufflevector(__s1_537, __s1_537, 3, 2, 1, 0); \ - float32_t __ret_537; \ - __ret_537 = __s0_537 * __noswap_vgetq_lane_f32(__rev1_537, __p2_537); \ - __ret_537; \ +#define vmuls_laneq_f32(__p0_629, __p1_629, __p2_629) __extension__ ({ \ + float32_t __s0_629 = __p0_629; \ + float32x4_t __s1_629 = __p1_629; \ + float32x4_t __rev1_629; __rev1_629 = __builtin_shufflevector(__s1_629, __s1_629, 3, 2, 1, 0); \ + float32_t __ret_629; \ + __ret_629 = __s0_629 * __noswap_vgetq_lane_f32(__rev1_629, __p2_629); \ + __ret_629; \ }) #endif @@ -54079,233 +55749,233 @@ __ai float64x1_t vmul_f64(float64x1_t __p0, float64x1_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vmulq_laneq_u32(__p0_538, __p1_538, __p2_538) __extension__ ({ \ - uint32x4_t __s0_538 = __p0_538; \ - uint32x4_t __s1_538 = __p1_538; \ - uint32x4_t __ret_538; \ - __ret_538 = __s0_538 * splatq_laneq_u32(__s1_538, __p2_538); \ - __ret_538; \ +#define vmulq_laneq_u32(__p0_630, __p1_630, __p2_630) __extension__ ({ \ + uint32x4_t __s0_630 = __p0_630; \ + uint32x4_t __s1_630 = __p1_630; \ + uint32x4_t __ret_630; \ + __ret_630 = __s0_630 * splatq_laneq_u32(__s1_630, __p2_630); \ + __ret_630; \ }) #else -#define vmulq_laneq_u32(__p0_539, __p1_539, __p2_539) __extension__ ({ \ - uint32x4_t __s0_539 = __p0_539; \ - uint32x4_t __s1_539 = __p1_539; \ - uint32x4_t __rev0_539; __rev0_539 = __builtin_shufflevector(__s0_539, __s0_539, 3, 2, 1, 0); \ - uint32x4_t __rev1_539; __rev1_539 = __builtin_shufflevector(__s1_539, __s1_539, 3, 2, 1, 0); \ - uint32x4_t __ret_539; \ - __ret_539 = __rev0_539 * __noswap_splatq_laneq_u32(__rev1_539, __p2_539); \ - __ret_539 = __builtin_shufflevector(__ret_539, __ret_539, 3, 2, 1, 0); \ - __ret_539; \ +#define vmulq_laneq_u32(__p0_631, __p1_631, __p2_631) __extension__ ({ \ + uint32x4_t __s0_631 = __p0_631; \ + uint32x4_t __s1_631 = __p1_631; \ + uint32x4_t __rev0_631; __rev0_631 = __builtin_shufflevector(__s0_631, __s0_631, 3, 2, 1, 0); \ + uint32x4_t __rev1_631; __rev1_631 = __builtin_shufflevector(__s1_631, __s1_631, 3, 2, 1, 0); \ + uint32x4_t __ret_631; \ + __ret_631 = __rev0_631 * __noswap_splatq_laneq_u32(__rev1_631, __p2_631); \ + __ret_631 = __builtin_shufflevector(__ret_631, __ret_631, 3, 2, 1, 0); \ + __ret_631; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmulq_laneq_u16(__p0_540, __p1_540, __p2_540) __extension__ ({ \ - uint16x8_t __s0_540 = __p0_540; \ - uint16x8_t __s1_540 = __p1_540; \ - uint16x8_t __ret_540; \ - __ret_540 = __s0_540 * splatq_laneq_u16(__s1_540, __p2_540); \ - __ret_540; \ +#define vmulq_laneq_u16(__p0_632, __p1_632, __p2_632) __extension__ ({ \ + uint16x8_t __s0_632 = __p0_632; \ + uint16x8_t __s1_632 = __p1_632; \ + uint16x8_t __ret_632; \ + __ret_632 = __s0_632 * 
splatq_laneq_u16(__s1_632, __p2_632); \ + __ret_632; \ }) #else -#define vmulq_laneq_u16(__p0_541, __p1_541, __p2_541) __extension__ ({ \ - uint16x8_t __s0_541 = __p0_541; \ - uint16x8_t __s1_541 = __p1_541; \ - uint16x8_t __rev0_541; __rev0_541 = __builtin_shufflevector(__s0_541, __s0_541, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __rev1_541; __rev1_541 = __builtin_shufflevector(__s1_541, __s1_541, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __ret_541; \ - __ret_541 = __rev0_541 * __noswap_splatq_laneq_u16(__rev1_541, __p2_541); \ - __ret_541 = __builtin_shufflevector(__ret_541, __ret_541, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_541; \ +#define vmulq_laneq_u16(__p0_633, __p1_633, __p2_633) __extension__ ({ \ + uint16x8_t __s0_633 = __p0_633; \ + uint16x8_t __s1_633 = __p1_633; \ + uint16x8_t __rev0_633; __rev0_633 = __builtin_shufflevector(__s0_633, __s0_633, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __rev1_633; __rev1_633 = __builtin_shufflevector(__s1_633, __s1_633, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __ret_633; \ + __ret_633 = __rev0_633 * __noswap_splatq_laneq_u16(__rev1_633, __p2_633); \ + __ret_633 = __builtin_shufflevector(__ret_633, __ret_633, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_633; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmulq_laneq_f64(__p0_542, __p1_542, __p2_542) __extension__ ({ \ - float64x2_t __s0_542 = __p0_542; \ - float64x2_t __s1_542 = __p1_542; \ - float64x2_t __ret_542; \ - __ret_542 = __s0_542 * splatq_laneq_f64(__s1_542, __p2_542); \ - __ret_542; \ +#define vmulq_laneq_f64(__p0_634, __p1_634, __p2_634) __extension__ ({ \ + float64x2_t __s0_634 = __p0_634; \ + float64x2_t __s1_634 = __p1_634; \ + float64x2_t __ret_634; \ + __ret_634 = __s0_634 * splatq_laneq_f64(__s1_634, __p2_634); \ + __ret_634; \ }) #else -#define vmulq_laneq_f64(__p0_543, __p1_543, __p2_543) __extension__ ({ \ - float64x2_t __s0_543 = __p0_543; \ - float64x2_t __s1_543 = __p1_543; \ - float64x2_t __rev0_543; __rev0_543 = __builtin_shufflevector(__s0_543, __s0_543, 1, 0); \ - float64x2_t __rev1_543; __rev1_543 = __builtin_shufflevector(__s1_543, __s1_543, 1, 0); \ - float64x2_t __ret_543; \ - __ret_543 = __rev0_543 * __noswap_splatq_laneq_f64(__rev1_543, __p2_543); \ - __ret_543 = __builtin_shufflevector(__ret_543, __ret_543, 1, 0); \ - __ret_543; \ +#define vmulq_laneq_f64(__p0_635, __p1_635, __p2_635) __extension__ ({ \ + float64x2_t __s0_635 = __p0_635; \ + float64x2_t __s1_635 = __p1_635; \ + float64x2_t __rev0_635; __rev0_635 = __builtin_shufflevector(__s0_635, __s0_635, 1, 0); \ + float64x2_t __rev1_635; __rev1_635 = __builtin_shufflevector(__s1_635, __s1_635, 1, 0); \ + float64x2_t __ret_635; \ + __ret_635 = __rev0_635 * __noswap_splatq_laneq_f64(__rev1_635, __p2_635); \ + __ret_635 = __builtin_shufflevector(__ret_635, __ret_635, 1, 0); \ + __ret_635; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmulq_laneq_f32(__p0_544, __p1_544, __p2_544) __extension__ ({ \ - float32x4_t __s0_544 = __p0_544; \ - float32x4_t __s1_544 = __p1_544; \ - float32x4_t __ret_544; \ - __ret_544 = __s0_544 * splatq_laneq_f32(__s1_544, __p2_544); \ - __ret_544; \ +#define vmulq_laneq_f32(__p0_636, __p1_636, __p2_636) __extension__ ({ \ + float32x4_t __s0_636 = __p0_636; \ + float32x4_t __s1_636 = __p1_636; \ + float32x4_t __ret_636; \ + __ret_636 = __s0_636 * splatq_laneq_f32(__s1_636, __p2_636); \ + __ret_636; \ }) #else -#define vmulq_laneq_f32(__p0_545, __p1_545, __p2_545) __extension__ ({ \ - float32x4_t __s0_545 = __p0_545; \ - float32x4_t __s1_545 = __p1_545; \ - float32x4_t __rev0_545; __rev0_545 = 
__builtin_shufflevector(__s0_545, __s0_545, 3, 2, 1, 0); \ - float32x4_t __rev1_545; __rev1_545 = __builtin_shufflevector(__s1_545, __s1_545, 3, 2, 1, 0); \ - float32x4_t __ret_545; \ - __ret_545 = __rev0_545 * __noswap_splatq_laneq_f32(__rev1_545, __p2_545); \ - __ret_545 = __builtin_shufflevector(__ret_545, __ret_545, 3, 2, 1, 0); \ - __ret_545; \ +#define vmulq_laneq_f32(__p0_637, __p1_637, __p2_637) __extension__ ({ \ + float32x4_t __s0_637 = __p0_637; \ + float32x4_t __s1_637 = __p1_637; \ + float32x4_t __rev0_637; __rev0_637 = __builtin_shufflevector(__s0_637, __s0_637, 3, 2, 1, 0); \ + float32x4_t __rev1_637; __rev1_637 = __builtin_shufflevector(__s1_637, __s1_637, 3, 2, 1, 0); \ + float32x4_t __ret_637; \ + __ret_637 = __rev0_637 * __noswap_splatq_laneq_f32(__rev1_637, __p2_637); \ + __ret_637 = __builtin_shufflevector(__ret_637, __ret_637, 3, 2, 1, 0); \ + __ret_637; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmulq_laneq_s32(__p0_546, __p1_546, __p2_546) __extension__ ({ \ - int32x4_t __s0_546 = __p0_546; \ - int32x4_t __s1_546 = __p1_546; \ - int32x4_t __ret_546; \ - __ret_546 = __s0_546 * splatq_laneq_s32(__s1_546, __p2_546); \ - __ret_546; \ +#define vmulq_laneq_s32(__p0_638, __p1_638, __p2_638) __extension__ ({ \ + int32x4_t __s0_638 = __p0_638; \ + int32x4_t __s1_638 = __p1_638; \ + int32x4_t __ret_638; \ + __ret_638 = __s0_638 * splatq_laneq_s32(__s1_638, __p2_638); \ + __ret_638; \ }) #else -#define vmulq_laneq_s32(__p0_547, __p1_547, __p2_547) __extension__ ({ \ - int32x4_t __s0_547 = __p0_547; \ - int32x4_t __s1_547 = __p1_547; \ - int32x4_t __rev0_547; __rev0_547 = __builtin_shufflevector(__s0_547, __s0_547, 3, 2, 1, 0); \ - int32x4_t __rev1_547; __rev1_547 = __builtin_shufflevector(__s1_547, __s1_547, 3, 2, 1, 0); \ - int32x4_t __ret_547; \ - __ret_547 = __rev0_547 * __noswap_splatq_laneq_s32(__rev1_547, __p2_547); \ - __ret_547 = __builtin_shufflevector(__ret_547, __ret_547, 3, 2, 1, 0); \ - __ret_547; \ +#define vmulq_laneq_s32(__p0_639, __p1_639, __p2_639) __extension__ ({ \ + int32x4_t __s0_639 = __p0_639; \ + int32x4_t __s1_639 = __p1_639; \ + int32x4_t __rev0_639; __rev0_639 = __builtin_shufflevector(__s0_639, __s0_639, 3, 2, 1, 0); \ + int32x4_t __rev1_639; __rev1_639 = __builtin_shufflevector(__s1_639, __s1_639, 3, 2, 1, 0); \ + int32x4_t __ret_639; \ + __ret_639 = __rev0_639 * __noswap_splatq_laneq_s32(__rev1_639, __p2_639); \ + __ret_639 = __builtin_shufflevector(__ret_639, __ret_639, 3, 2, 1, 0); \ + __ret_639; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmulq_laneq_s16(__p0_548, __p1_548, __p2_548) __extension__ ({ \ - int16x8_t __s0_548 = __p0_548; \ - int16x8_t __s1_548 = __p1_548; \ - int16x8_t __ret_548; \ - __ret_548 = __s0_548 * splatq_laneq_s16(__s1_548, __p2_548); \ - __ret_548; \ +#define vmulq_laneq_s16(__p0_640, __p1_640, __p2_640) __extension__ ({ \ + int16x8_t __s0_640 = __p0_640; \ + int16x8_t __s1_640 = __p1_640; \ + int16x8_t __ret_640; \ + __ret_640 = __s0_640 * splatq_laneq_s16(__s1_640, __p2_640); \ + __ret_640; \ }) #else -#define vmulq_laneq_s16(__p0_549, __p1_549, __p2_549) __extension__ ({ \ - int16x8_t __s0_549 = __p0_549; \ - int16x8_t __s1_549 = __p1_549; \ - int16x8_t __rev0_549; __rev0_549 = __builtin_shufflevector(__s0_549, __s0_549, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev1_549; __rev1_549 = __builtin_shufflevector(__s1_549, __s1_549, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __ret_549; \ - __ret_549 = __rev0_549 * __noswap_splatq_laneq_s16(__rev1_549, __p2_549); \ - __ret_549 = __builtin_shufflevector(__ret_549, 
__ret_549, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_549; \ +#define vmulq_laneq_s16(__p0_641, __p1_641, __p2_641) __extension__ ({ \ + int16x8_t __s0_641 = __p0_641; \ + int16x8_t __s1_641 = __p1_641; \ + int16x8_t __rev0_641; __rev0_641 = __builtin_shufflevector(__s0_641, __s0_641, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev1_641; __rev1_641 = __builtin_shufflevector(__s1_641, __s1_641, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __ret_641; \ + __ret_641 = __rev0_641 * __noswap_splatq_laneq_s16(__rev1_641, __p2_641); \ + __ret_641 = __builtin_shufflevector(__ret_641, __ret_641, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_641; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmul_laneq_u32(__p0_550, __p1_550, __p2_550) __extension__ ({ \ - uint32x2_t __s0_550 = __p0_550; \ - uint32x4_t __s1_550 = __p1_550; \ - uint32x2_t __ret_550; \ - __ret_550 = __s0_550 * splat_laneq_u32(__s1_550, __p2_550); \ - __ret_550; \ +#define vmul_laneq_u32(__p0_642, __p1_642, __p2_642) __extension__ ({ \ + uint32x2_t __s0_642 = __p0_642; \ + uint32x4_t __s1_642 = __p1_642; \ + uint32x2_t __ret_642; \ + __ret_642 = __s0_642 * splat_laneq_u32(__s1_642, __p2_642); \ + __ret_642; \ }) #else -#define vmul_laneq_u32(__p0_551, __p1_551, __p2_551) __extension__ ({ \ - uint32x2_t __s0_551 = __p0_551; \ - uint32x4_t __s1_551 = __p1_551; \ - uint32x2_t __rev0_551; __rev0_551 = __builtin_shufflevector(__s0_551, __s0_551, 1, 0); \ - uint32x4_t __rev1_551; __rev1_551 = __builtin_shufflevector(__s1_551, __s1_551, 3, 2, 1, 0); \ - uint32x2_t __ret_551; \ - __ret_551 = __rev0_551 * __noswap_splat_laneq_u32(__rev1_551, __p2_551); \ - __ret_551 = __builtin_shufflevector(__ret_551, __ret_551, 1, 0); \ - __ret_551; \ +#define vmul_laneq_u32(__p0_643, __p1_643, __p2_643) __extension__ ({ \ + uint32x2_t __s0_643 = __p0_643; \ + uint32x4_t __s1_643 = __p1_643; \ + uint32x2_t __rev0_643; __rev0_643 = __builtin_shufflevector(__s0_643, __s0_643, 1, 0); \ + uint32x4_t __rev1_643; __rev1_643 = __builtin_shufflevector(__s1_643, __s1_643, 3, 2, 1, 0); \ + uint32x2_t __ret_643; \ + __ret_643 = __rev0_643 * __noswap_splat_laneq_u32(__rev1_643, __p2_643); \ + __ret_643 = __builtin_shufflevector(__ret_643, __ret_643, 1, 0); \ + __ret_643; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmul_laneq_u16(__p0_552, __p1_552, __p2_552) __extension__ ({ \ - uint16x4_t __s0_552 = __p0_552; \ - uint16x8_t __s1_552 = __p1_552; \ - uint16x4_t __ret_552; \ - __ret_552 = __s0_552 * splat_laneq_u16(__s1_552, __p2_552); \ - __ret_552; \ +#define vmul_laneq_u16(__p0_644, __p1_644, __p2_644) __extension__ ({ \ + uint16x4_t __s0_644 = __p0_644; \ + uint16x8_t __s1_644 = __p1_644; \ + uint16x4_t __ret_644; \ + __ret_644 = __s0_644 * splat_laneq_u16(__s1_644, __p2_644); \ + __ret_644; \ }) #else -#define vmul_laneq_u16(__p0_553, __p1_553, __p2_553) __extension__ ({ \ - uint16x4_t __s0_553 = __p0_553; \ - uint16x8_t __s1_553 = __p1_553; \ - uint16x4_t __rev0_553; __rev0_553 = __builtin_shufflevector(__s0_553, __s0_553, 3, 2, 1, 0); \ - uint16x8_t __rev1_553; __rev1_553 = __builtin_shufflevector(__s1_553, __s1_553, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x4_t __ret_553; \ - __ret_553 = __rev0_553 * __noswap_splat_laneq_u16(__rev1_553, __p2_553); \ - __ret_553 = __builtin_shufflevector(__ret_553, __ret_553, 3, 2, 1, 0); \ - __ret_553; \ +#define vmul_laneq_u16(__p0_645, __p1_645, __p2_645) __extension__ ({ \ + uint16x4_t __s0_645 = __p0_645; \ + uint16x8_t __s1_645 = __p1_645; \ + uint16x4_t __rev0_645; __rev0_645 = __builtin_shufflevector(__s0_645, __s0_645, 3, 2, 1, 0); \ + uint16x8_t 
__rev1_645; __rev1_645 = __builtin_shufflevector(__s1_645, __s1_645, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x4_t __ret_645; \ + __ret_645 = __rev0_645 * __noswap_splat_laneq_u16(__rev1_645, __p2_645); \ + __ret_645 = __builtin_shufflevector(__ret_645, __ret_645, 3, 2, 1, 0); \ + __ret_645; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmul_laneq_f32(__p0_554, __p1_554, __p2_554) __extension__ ({ \ - float32x2_t __s0_554 = __p0_554; \ - float32x4_t __s1_554 = __p1_554; \ - float32x2_t __ret_554; \ - __ret_554 = __s0_554 * splat_laneq_f32(__s1_554, __p2_554); \ - __ret_554; \ +#define vmul_laneq_f32(__p0_646, __p1_646, __p2_646) __extension__ ({ \ + float32x2_t __s0_646 = __p0_646; \ + float32x4_t __s1_646 = __p1_646; \ + float32x2_t __ret_646; \ + __ret_646 = __s0_646 * splat_laneq_f32(__s1_646, __p2_646); \ + __ret_646; \ }) #else -#define vmul_laneq_f32(__p0_555, __p1_555, __p2_555) __extension__ ({ \ - float32x2_t __s0_555 = __p0_555; \ - float32x4_t __s1_555 = __p1_555; \ - float32x2_t __rev0_555; __rev0_555 = __builtin_shufflevector(__s0_555, __s0_555, 1, 0); \ - float32x4_t __rev1_555; __rev1_555 = __builtin_shufflevector(__s1_555, __s1_555, 3, 2, 1, 0); \ - float32x2_t __ret_555; \ - __ret_555 = __rev0_555 * __noswap_splat_laneq_f32(__rev1_555, __p2_555); \ - __ret_555 = __builtin_shufflevector(__ret_555, __ret_555, 1, 0); \ - __ret_555; \ +#define vmul_laneq_f32(__p0_647, __p1_647, __p2_647) __extension__ ({ \ + float32x2_t __s0_647 = __p0_647; \ + float32x4_t __s1_647 = __p1_647; \ + float32x2_t __rev0_647; __rev0_647 = __builtin_shufflevector(__s0_647, __s0_647, 1, 0); \ + float32x4_t __rev1_647; __rev1_647 = __builtin_shufflevector(__s1_647, __s1_647, 3, 2, 1, 0); \ + float32x2_t __ret_647; \ + __ret_647 = __rev0_647 * __noswap_splat_laneq_f32(__rev1_647, __p2_647); \ + __ret_647 = __builtin_shufflevector(__ret_647, __ret_647, 1, 0); \ + __ret_647; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmul_laneq_s32(__p0_556, __p1_556, __p2_556) __extension__ ({ \ - int32x2_t __s0_556 = __p0_556; \ - int32x4_t __s1_556 = __p1_556; \ - int32x2_t __ret_556; \ - __ret_556 = __s0_556 * splat_laneq_s32(__s1_556, __p2_556); \ - __ret_556; \ +#define vmul_laneq_s32(__p0_648, __p1_648, __p2_648) __extension__ ({ \ + int32x2_t __s0_648 = __p0_648; \ + int32x4_t __s1_648 = __p1_648; \ + int32x2_t __ret_648; \ + __ret_648 = __s0_648 * splat_laneq_s32(__s1_648, __p2_648); \ + __ret_648; \ }) #else -#define vmul_laneq_s32(__p0_557, __p1_557, __p2_557) __extension__ ({ \ - int32x2_t __s0_557 = __p0_557; \ - int32x4_t __s1_557 = __p1_557; \ - int32x2_t __rev0_557; __rev0_557 = __builtin_shufflevector(__s0_557, __s0_557, 1, 0); \ - int32x4_t __rev1_557; __rev1_557 = __builtin_shufflevector(__s1_557, __s1_557, 3, 2, 1, 0); \ - int32x2_t __ret_557; \ - __ret_557 = __rev0_557 * __noswap_splat_laneq_s32(__rev1_557, __p2_557); \ - __ret_557 = __builtin_shufflevector(__ret_557, __ret_557, 1, 0); \ - __ret_557; \ +#define vmul_laneq_s32(__p0_649, __p1_649, __p2_649) __extension__ ({ \ + int32x2_t __s0_649 = __p0_649; \ + int32x4_t __s1_649 = __p1_649; \ + int32x2_t __rev0_649; __rev0_649 = __builtin_shufflevector(__s0_649, __s0_649, 1, 0); \ + int32x4_t __rev1_649; __rev1_649 = __builtin_shufflevector(__s1_649, __s1_649, 3, 2, 1, 0); \ + int32x2_t __ret_649; \ + __ret_649 = __rev0_649 * __noswap_splat_laneq_s32(__rev1_649, __p2_649); \ + __ret_649 = __builtin_shufflevector(__ret_649, __ret_649, 1, 0); \ + __ret_649; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmul_laneq_s16(__p0_558, __p1_558, __p2_558) 
__extension__ ({ \ - int16x4_t __s0_558 = __p0_558; \ - int16x8_t __s1_558 = __p1_558; \ - int16x4_t __ret_558; \ - __ret_558 = __s0_558 * splat_laneq_s16(__s1_558, __p2_558); \ - __ret_558; \ +#define vmul_laneq_s16(__p0_650, __p1_650, __p2_650) __extension__ ({ \ + int16x4_t __s0_650 = __p0_650; \ + int16x8_t __s1_650 = __p1_650; \ + int16x4_t __ret_650; \ + __ret_650 = __s0_650 * splat_laneq_s16(__s1_650, __p2_650); \ + __ret_650; \ }) #else -#define vmul_laneq_s16(__p0_559, __p1_559, __p2_559) __extension__ ({ \ - int16x4_t __s0_559 = __p0_559; \ - int16x8_t __s1_559 = __p1_559; \ - int16x4_t __rev0_559; __rev0_559 = __builtin_shufflevector(__s0_559, __s0_559, 3, 2, 1, 0); \ - int16x8_t __rev1_559; __rev1_559 = __builtin_shufflevector(__s1_559, __s1_559, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x4_t __ret_559; \ - __ret_559 = __rev0_559 * __noswap_splat_laneq_s16(__rev1_559, __p2_559); \ - __ret_559 = __builtin_shufflevector(__ret_559, __ret_559, 3, 2, 1, 0); \ - __ret_559; \ +#define vmul_laneq_s16(__p0_651, __p1_651, __p2_651) __extension__ ({ \ + int16x4_t __s0_651 = __p0_651; \ + int16x8_t __s1_651 = __p1_651; \ + int16x4_t __rev0_651; __rev0_651 = __builtin_shufflevector(__s0_651, __s0_651, 3, 2, 1, 0); \ + int16x8_t __rev1_651; __rev1_651 = __builtin_shufflevector(__s1_651, __s1_651, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x4_t __ret_651; \ + __ret_651 = __rev0_651 * __noswap_splat_laneq_s16(__rev1_651, __p2_651); \ + __ret_651 = __builtin_shufflevector(__ret_651, __ret_651, 3, 2, 1, 0); \ + __ret_651; \ }) #endif @@ -54471,170 +56141,170 @@ __ai poly128_t vmull_high_p64(poly64x2_t __p0, poly64x2_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vmull_high_lane_u32(__p0_560, __p1_560, __p2_560) __extension__ ({ \ - uint32x4_t __s0_560 = __p0_560; \ - uint32x2_t __s1_560 = __p1_560; \ - uint64x2_t __ret_560; \ - __ret_560 = vmull_u32(vget_high_u32(__s0_560), splat_lane_u32(__s1_560, __p2_560)); \ - __ret_560; \ +#define vmull_high_lane_u32(__p0_652, __p1_652, __p2_652) __extension__ ({ \ + uint32x4_t __s0_652 = __p0_652; \ + uint32x2_t __s1_652 = __p1_652; \ + uint64x2_t __ret_652; \ + __ret_652 = vmull_u32(vget_high_u32(__s0_652), splat_lane_u32(__s1_652, __p2_652)); \ + __ret_652; \ }) #else -#define vmull_high_lane_u32(__p0_561, __p1_561, __p2_561) __extension__ ({ \ - uint32x4_t __s0_561 = __p0_561; \ - uint32x2_t __s1_561 = __p1_561; \ - uint32x4_t __rev0_561; __rev0_561 = __builtin_shufflevector(__s0_561, __s0_561, 3, 2, 1, 0); \ - uint32x2_t __rev1_561; __rev1_561 = __builtin_shufflevector(__s1_561, __s1_561, 1, 0); \ - uint64x2_t __ret_561; \ - __ret_561 = __noswap_vmull_u32(__noswap_vget_high_u32(__rev0_561), __noswap_splat_lane_u32(__rev1_561, __p2_561)); \ - __ret_561 = __builtin_shufflevector(__ret_561, __ret_561, 1, 0); \ - __ret_561; \ +#define vmull_high_lane_u32(__p0_653, __p1_653, __p2_653) __extension__ ({ \ + uint32x4_t __s0_653 = __p0_653; \ + uint32x2_t __s1_653 = __p1_653; \ + uint32x4_t __rev0_653; __rev0_653 = __builtin_shufflevector(__s0_653, __s0_653, 3, 2, 1, 0); \ + uint32x2_t __rev1_653; __rev1_653 = __builtin_shufflevector(__s1_653, __s1_653, 1, 0); \ + uint64x2_t __ret_653; \ + __ret_653 = __noswap_vmull_u32(__noswap_vget_high_u32(__rev0_653), __noswap_splat_lane_u32(__rev1_653, __p2_653)); \ + __ret_653 = __builtin_shufflevector(__ret_653, __ret_653, 1, 0); \ + __ret_653; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmull_high_lane_u16(__p0_562, __p1_562, __p2_562) __extension__ ({ \ - uint16x8_t __s0_562 = __p0_562; \ - uint16x4_t __s1_562 = __p1_562; \ - 
uint32x4_t __ret_562; \ - __ret_562 = vmull_u16(vget_high_u16(__s0_562), splat_lane_u16(__s1_562, __p2_562)); \ - __ret_562; \ +#define vmull_high_lane_u16(__p0_654, __p1_654, __p2_654) __extension__ ({ \ + uint16x8_t __s0_654 = __p0_654; \ + uint16x4_t __s1_654 = __p1_654; \ + uint32x4_t __ret_654; \ + __ret_654 = vmull_u16(vget_high_u16(__s0_654), splat_lane_u16(__s1_654, __p2_654)); \ + __ret_654; \ }) #else -#define vmull_high_lane_u16(__p0_563, __p1_563, __p2_563) __extension__ ({ \ - uint16x8_t __s0_563 = __p0_563; \ - uint16x4_t __s1_563 = __p1_563; \ - uint16x8_t __rev0_563; __rev0_563 = __builtin_shufflevector(__s0_563, __s0_563, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x4_t __rev1_563; __rev1_563 = __builtin_shufflevector(__s1_563, __s1_563, 3, 2, 1, 0); \ - uint32x4_t __ret_563; \ - __ret_563 = __noswap_vmull_u16(__noswap_vget_high_u16(__rev0_563), __noswap_splat_lane_u16(__rev1_563, __p2_563)); \ - __ret_563 = __builtin_shufflevector(__ret_563, __ret_563, 3, 2, 1, 0); \ - __ret_563; \ +#define vmull_high_lane_u16(__p0_655, __p1_655, __p2_655) __extension__ ({ \ + uint16x8_t __s0_655 = __p0_655; \ + uint16x4_t __s1_655 = __p1_655; \ + uint16x8_t __rev0_655; __rev0_655 = __builtin_shufflevector(__s0_655, __s0_655, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x4_t __rev1_655; __rev1_655 = __builtin_shufflevector(__s1_655, __s1_655, 3, 2, 1, 0); \ + uint32x4_t __ret_655; \ + __ret_655 = __noswap_vmull_u16(__noswap_vget_high_u16(__rev0_655), __noswap_splat_lane_u16(__rev1_655, __p2_655)); \ + __ret_655 = __builtin_shufflevector(__ret_655, __ret_655, 3, 2, 1, 0); \ + __ret_655; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmull_high_lane_s32(__p0_564, __p1_564, __p2_564) __extension__ ({ \ - int32x4_t __s0_564 = __p0_564; \ - int32x2_t __s1_564 = __p1_564; \ - int64x2_t __ret_564; \ - __ret_564 = vmull_s32(vget_high_s32(__s0_564), splat_lane_s32(__s1_564, __p2_564)); \ - __ret_564; \ +#define vmull_high_lane_s32(__p0_656, __p1_656, __p2_656) __extension__ ({ \ + int32x4_t __s0_656 = __p0_656; \ + int32x2_t __s1_656 = __p1_656; \ + int64x2_t __ret_656; \ + __ret_656 = vmull_s32(vget_high_s32(__s0_656), splat_lane_s32(__s1_656, __p2_656)); \ + __ret_656; \ }) #else -#define vmull_high_lane_s32(__p0_565, __p1_565, __p2_565) __extension__ ({ \ - int32x4_t __s0_565 = __p0_565; \ - int32x2_t __s1_565 = __p1_565; \ - int32x4_t __rev0_565; __rev0_565 = __builtin_shufflevector(__s0_565, __s0_565, 3, 2, 1, 0); \ - int32x2_t __rev1_565; __rev1_565 = __builtin_shufflevector(__s1_565, __s1_565, 1, 0); \ - int64x2_t __ret_565; \ - __ret_565 = __noswap_vmull_s32(__noswap_vget_high_s32(__rev0_565), __noswap_splat_lane_s32(__rev1_565, __p2_565)); \ - __ret_565 = __builtin_shufflevector(__ret_565, __ret_565, 1, 0); \ - __ret_565; \ +#define vmull_high_lane_s32(__p0_657, __p1_657, __p2_657) __extension__ ({ \ + int32x4_t __s0_657 = __p0_657; \ + int32x2_t __s1_657 = __p1_657; \ + int32x4_t __rev0_657; __rev0_657 = __builtin_shufflevector(__s0_657, __s0_657, 3, 2, 1, 0); \ + int32x2_t __rev1_657; __rev1_657 = __builtin_shufflevector(__s1_657, __s1_657, 1, 0); \ + int64x2_t __ret_657; \ + __ret_657 = __noswap_vmull_s32(__noswap_vget_high_s32(__rev0_657), __noswap_splat_lane_s32(__rev1_657, __p2_657)); \ + __ret_657 = __builtin_shufflevector(__ret_657, __ret_657, 1, 0); \ + __ret_657; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmull_high_lane_s16(__p0_566, __p1_566, __p2_566) __extension__ ({ \ - int16x8_t __s0_566 = __p0_566; \ - int16x4_t __s1_566 = __p1_566; \ - int32x4_t __ret_566; \ - __ret_566 = 
vmull_s16(vget_high_s16(__s0_566), splat_lane_s16(__s1_566, __p2_566)); \ - __ret_566; \ +#define vmull_high_lane_s16(__p0_658, __p1_658, __p2_658) __extension__ ({ \ + int16x8_t __s0_658 = __p0_658; \ + int16x4_t __s1_658 = __p1_658; \ + int32x4_t __ret_658; \ + __ret_658 = vmull_s16(vget_high_s16(__s0_658), splat_lane_s16(__s1_658, __p2_658)); \ + __ret_658; \ }) #else -#define vmull_high_lane_s16(__p0_567, __p1_567, __p2_567) __extension__ ({ \ - int16x8_t __s0_567 = __p0_567; \ - int16x4_t __s1_567 = __p1_567; \ - int16x8_t __rev0_567; __rev0_567 = __builtin_shufflevector(__s0_567, __s0_567, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x4_t __rev1_567; __rev1_567 = __builtin_shufflevector(__s1_567, __s1_567, 3, 2, 1, 0); \ - int32x4_t __ret_567; \ - __ret_567 = __noswap_vmull_s16(__noswap_vget_high_s16(__rev0_567), __noswap_splat_lane_s16(__rev1_567, __p2_567)); \ - __ret_567 = __builtin_shufflevector(__ret_567, __ret_567, 3, 2, 1, 0); \ - __ret_567; \ +#define vmull_high_lane_s16(__p0_659, __p1_659, __p2_659) __extension__ ({ \ + int16x8_t __s0_659 = __p0_659; \ + int16x4_t __s1_659 = __p1_659; \ + int16x8_t __rev0_659; __rev0_659 = __builtin_shufflevector(__s0_659, __s0_659, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x4_t __rev1_659; __rev1_659 = __builtin_shufflevector(__s1_659, __s1_659, 3, 2, 1, 0); \ + int32x4_t __ret_659; \ + __ret_659 = __noswap_vmull_s16(__noswap_vget_high_s16(__rev0_659), __noswap_splat_lane_s16(__rev1_659, __p2_659)); \ + __ret_659 = __builtin_shufflevector(__ret_659, __ret_659, 3, 2, 1, 0); \ + __ret_659; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmull_high_laneq_u32(__p0_568, __p1_568, __p2_568) __extension__ ({ \ - uint32x4_t __s0_568 = __p0_568; \ - uint32x4_t __s1_568 = __p1_568; \ - uint64x2_t __ret_568; \ - __ret_568 = vmull_u32(vget_high_u32(__s0_568), splat_laneq_u32(__s1_568, __p2_568)); \ - __ret_568; \ +#define vmull_high_laneq_u32(__p0_660, __p1_660, __p2_660) __extension__ ({ \ + uint32x4_t __s0_660 = __p0_660; \ + uint32x4_t __s1_660 = __p1_660; \ + uint64x2_t __ret_660; \ + __ret_660 = vmull_u32(vget_high_u32(__s0_660), splat_laneq_u32(__s1_660, __p2_660)); \ + __ret_660; \ }) #else -#define vmull_high_laneq_u32(__p0_569, __p1_569, __p2_569) __extension__ ({ \ - uint32x4_t __s0_569 = __p0_569; \ - uint32x4_t __s1_569 = __p1_569; \ - uint32x4_t __rev0_569; __rev0_569 = __builtin_shufflevector(__s0_569, __s0_569, 3, 2, 1, 0); \ - uint32x4_t __rev1_569; __rev1_569 = __builtin_shufflevector(__s1_569, __s1_569, 3, 2, 1, 0); \ - uint64x2_t __ret_569; \ - __ret_569 = __noswap_vmull_u32(__noswap_vget_high_u32(__rev0_569), __noswap_splat_laneq_u32(__rev1_569, __p2_569)); \ - __ret_569 = __builtin_shufflevector(__ret_569, __ret_569, 1, 0); \ - __ret_569; \ +#define vmull_high_laneq_u32(__p0_661, __p1_661, __p2_661) __extension__ ({ \ + uint32x4_t __s0_661 = __p0_661; \ + uint32x4_t __s1_661 = __p1_661; \ + uint32x4_t __rev0_661; __rev0_661 = __builtin_shufflevector(__s0_661, __s0_661, 3, 2, 1, 0); \ + uint32x4_t __rev1_661; __rev1_661 = __builtin_shufflevector(__s1_661, __s1_661, 3, 2, 1, 0); \ + uint64x2_t __ret_661; \ + __ret_661 = __noswap_vmull_u32(__noswap_vget_high_u32(__rev0_661), __noswap_splat_laneq_u32(__rev1_661, __p2_661)); \ + __ret_661 = __builtin_shufflevector(__ret_661, __ret_661, 1, 0); \ + __ret_661; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmull_high_laneq_u16(__p0_570, __p1_570, __p2_570) __extension__ ({ \ - uint16x8_t __s0_570 = __p0_570; \ - uint16x8_t __s1_570 = __p1_570; \ - uint32x4_t __ret_570; \ - __ret_570 = 
vmull_u16(vget_high_u16(__s0_570), splat_laneq_u16(__s1_570, __p2_570)); \ - __ret_570; \ +#define vmull_high_laneq_u16(__p0_662, __p1_662, __p2_662) __extension__ ({ \ + uint16x8_t __s0_662 = __p0_662; \ + uint16x8_t __s1_662 = __p1_662; \ + uint32x4_t __ret_662; \ + __ret_662 = vmull_u16(vget_high_u16(__s0_662), splat_laneq_u16(__s1_662, __p2_662)); \ + __ret_662; \ }) #else -#define vmull_high_laneq_u16(__p0_571, __p1_571, __p2_571) __extension__ ({ \ - uint16x8_t __s0_571 = __p0_571; \ - uint16x8_t __s1_571 = __p1_571; \ - uint16x8_t __rev0_571; __rev0_571 = __builtin_shufflevector(__s0_571, __s0_571, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __rev1_571; __rev1_571 = __builtin_shufflevector(__s1_571, __s1_571, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint32x4_t __ret_571; \ - __ret_571 = __noswap_vmull_u16(__noswap_vget_high_u16(__rev0_571), __noswap_splat_laneq_u16(__rev1_571, __p2_571)); \ - __ret_571 = __builtin_shufflevector(__ret_571, __ret_571, 3, 2, 1, 0); \ - __ret_571; \ +#define vmull_high_laneq_u16(__p0_663, __p1_663, __p2_663) __extension__ ({ \ + uint16x8_t __s0_663 = __p0_663; \ + uint16x8_t __s1_663 = __p1_663; \ + uint16x8_t __rev0_663; __rev0_663 = __builtin_shufflevector(__s0_663, __s0_663, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __rev1_663; __rev1_663 = __builtin_shufflevector(__s1_663, __s1_663, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint32x4_t __ret_663; \ + __ret_663 = __noswap_vmull_u16(__noswap_vget_high_u16(__rev0_663), __noswap_splat_laneq_u16(__rev1_663, __p2_663)); \ + __ret_663 = __builtin_shufflevector(__ret_663, __ret_663, 3, 2, 1, 0); \ + __ret_663; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmull_high_laneq_s32(__p0_572, __p1_572, __p2_572) __extension__ ({ \ - int32x4_t __s0_572 = __p0_572; \ - int32x4_t __s1_572 = __p1_572; \ - int64x2_t __ret_572; \ - __ret_572 = vmull_s32(vget_high_s32(__s0_572), splat_laneq_s32(__s1_572, __p2_572)); \ - __ret_572; \ +#define vmull_high_laneq_s32(__p0_664, __p1_664, __p2_664) __extension__ ({ \ + int32x4_t __s0_664 = __p0_664; \ + int32x4_t __s1_664 = __p1_664; \ + int64x2_t __ret_664; \ + __ret_664 = vmull_s32(vget_high_s32(__s0_664), splat_laneq_s32(__s1_664, __p2_664)); \ + __ret_664; \ }) #else -#define vmull_high_laneq_s32(__p0_573, __p1_573, __p2_573) __extension__ ({ \ - int32x4_t __s0_573 = __p0_573; \ - int32x4_t __s1_573 = __p1_573; \ - int32x4_t __rev0_573; __rev0_573 = __builtin_shufflevector(__s0_573, __s0_573, 3, 2, 1, 0); \ - int32x4_t __rev1_573; __rev1_573 = __builtin_shufflevector(__s1_573, __s1_573, 3, 2, 1, 0); \ - int64x2_t __ret_573; \ - __ret_573 = __noswap_vmull_s32(__noswap_vget_high_s32(__rev0_573), __noswap_splat_laneq_s32(__rev1_573, __p2_573)); \ - __ret_573 = __builtin_shufflevector(__ret_573, __ret_573, 1, 0); \ - __ret_573; \ +#define vmull_high_laneq_s32(__p0_665, __p1_665, __p2_665) __extension__ ({ \ + int32x4_t __s0_665 = __p0_665; \ + int32x4_t __s1_665 = __p1_665; \ + int32x4_t __rev0_665; __rev0_665 = __builtin_shufflevector(__s0_665, __s0_665, 3, 2, 1, 0); \ + int32x4_t __rev1_665; __rev1_665 = __builtin_shufflevector(__s1_665, __s1_665, 3, 2, 1, 0); \ + int64x2_t __ret_665; \ + __ret_665 = __noswap_vmull_s32(__noswap_vget_high_s32(__rev0_665), __noswap_splat_laneq_s32(__rev1_665, __p2_665)); \ + __ret_665 = __builtin_shufflevector(__ret_665, __ret_665, 1, 0); \ + __ret_665; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmull_high_laneq_s16(__p0_574, __p1_574, __p2_574) __extension__ ({ \ - int16x8_t __s0_574 = __p0_574; \ - int16x8_t __s1_574 = __p1_574; \ - int32x4_t __ret_574; \ - __ret_574 = 
vmull_s16(vget_high_s16(__s0_574), splat_laneq_s16(__s1_574, __p2_574)); \ - __ret_574; \ +#define vmull_high_laneq_s16(__p0_666, __p1_666, __p2_666) __extension__ ({ \ + int16x8_t __s0_666 = __p0_666; \ + int16x8_t __s1_666 = __p1_666; \ + int32x4_t __ret_666; \ + __ret_666 = vmull_s16(vget_high_s16(__s0_666), splat_laneq_s16(__s1_666, __p2_666)); \ + __ret_666; \ }) #else -#define vmull_high_laneq_s16(__p0_575, __p1_575, __p2_575) __extension__ ({ \ - int16x8_t __s0_575 = __p0_575; \ - int16x8_t __s1_575 = __p1_575; \ - int16x8_t __rev0_575; __rev0_575 = __builtin_shufflevector(__s0_575, __s0_575, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev1_575; __rev1_575 = __builtin_shufflevector(__s1_575, __s1_575, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32x4_t __ret_575; \ - __ret_575 = __noswap_vmull_s16(__noswap_vget_high_s16(__rev0_575), __noswap_splat_laneq_s16(__rev1_575, __p2_575)); \ - __ret_575 = __builtin_shufflevector(__ret_575, __ret_575, 3, 2, 1, 0); \ - __ret_575; \ +#define vmull_high_laneq_s16(__p0_667, __p1_667, __p2_667) __extension__ ({ \ + int16x8_t __s0_667 = __p0_667; \ + int16x8_t __s1_667 = __p1_667; \ + int16x8_t __rev0_667; __rev0_667 = __builtin_shufflevector(__s0_667, __s0_667, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev1_667; __rev1_667 = __builtin_shufflevector(__s1_667, __s1_667, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x4_t __ret_667; \ + __ret_667 = __noswap_vmull_s16(__noswap_vget_high_s16(__rev0_667), __noswap_splat_laneq_s16(__rev1_667, __p2_667)); \ + __ret_667 = __builtin_shufflevector(__ret_667, __ret_667, 3, 2, 1, 0); \ + __ret_667; \ }) #endif @@ -54703,86 +56373,86 @@ __ai int32x4_t vmull_high_n_s16(int16x8_t __p0, int16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vmull_laneq_u32(__p0_576, __p1_576, __p2_576) __extension__ ({ \ - uint32x2_t __s0_576 = __p0_576; \ - uint32x4_t __s1_576 = __p1_576; \ - uint64x2_t __ret_576; \ - __ret_576 = vmull_u32(__s0_576, splat_laneq_u32(__s1_576, __p2_576)); \ - __ret_576; \ +#define vmull_laneq_u32(__p0_668, __p1_668, __p2_668) __extension__ ({ \ + uint32x2_t __s0_668 = __p0_668; \ + uint32x4_t __s1_668 = __p1_668; \ + uint64x2_t __ret_668; \ + __ret_668 = vmull_u32(__s0_668, splat_laneq_u32(__s1_668, __p2_668)); \ + __ret_668; \ }) #else -#define vmull_laneq_u32(__p0_577, __p1_577, __p2_577) __extension__ ({ \ - uint32x2_t __s0_577 = __p0_577; \ - uint32x4_t __s1_577 = __p1_577; \ - uint32x2_t __rev0_577; __rev0_577 = __builtin_shufflevector(__s0_577, __s0_577, 1, 0); \ - uint32x4_t __rev1_577; __rev1_577 = __builtin_shufflevector(__s1_577, __s1_577, 3, 2, 1, 0); \ - uint64x2_t __ret_577; \ - __ret_577 = __noswap_vmull_u32(__rev0_577, __noswap_splat_laneq_u32(__rev1_577, __p2_577)); \ - __ret_577 = __builtin_shufflevector(__ret_577, __ret_577, 1, 0); \ - __ret_577; \ +#define vmull_laneq_u32(__p0_669, __p1_669, __p2_669) __extension__ ({ \ + uint32x2_t __s0_669 = __p0_669; \ + uint32x4_t __s1_669 = __p1_669; \ + uint32x2_t __rev0_669; __rev0_669 = __builtin_shufflevector(__s0_669, __s0_669, 1, 0); \ + uint32x4_t __rev1_669; __rev1_669 = __builtin_shufflevector(__s1_669, __s1_669, 3, 2, 1, 0); \ + uint64x2_t __ret_669; \ + __ret_669 = __noswap_vmull_u32(__rev0_669, __noswap_splat_laneq_u32(__rev1_669, __p2_669)); \ + __ret_669 = __builtin_shufflevector(__ret_669, __ret_669, 1, 0); \ + __ret_669; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmull_laneq_u16(__p0_578, __p1_578, __p2_578) __extension__ ({ \ - uint16x4_t __s0_578 = __p0_578; \ - uint16x8_t __s1_578 = __p1_578; \ - uint32x4_t __ret_578; \ - __ret_578 = 
vmull_u16(__s0_578, splat_laneq_u16(__s1_578, __p2_578)); \ - __ret_578; \ +#define vmull_laneq_u16(__p0_670, __p1_670, __p2_670) __extension__ ({ \ + uint16x4_t __s0_670 = __p0_670; \ + uint16x8_t __s1_670 = __p1_670; \ + uint32x4_t __ret_670; \ + __ret_670 = vmull_u16(__s0_670, splat_laneq_u16(__s1_670, __p2_670)); \ + __ret_670; \ }) #else -#define vmull_laneq_u16(__p0_579, __p1_579, __p2_579) __extension__ ({ \ - uint16x4_t __s0_579 = __p0_579; \ - uint16x8_t __s1_579 = __p1_579; \ - uint16x4_t __rev0_579; __rev0_579 = __builtin_shufflevector(__s0_579, __s0_579, 3, 2, 1, 0); \ - uint16x8_t __rev1_579; __rev1_579 = __builtin_shufflevector(__s1_579, __s1_579, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint32x4_t __ret_579; \ - __ret_579 = __noswap_vmull_u16(__rev0_579, __noswap_splat_laneq_u16(__rev1_579, __p2_579)); \ - __ret_579 = __builtin_shufflevector(__ret_579, __ret_579, 3, 2, 1, 0); \ - __ret_579; \ +#define vmull_laneq_u16(__p0_671, __p1_671, __p2_671) __extension__ ({ \ + uint16x4_t __s0_671 = __p0_671; \ + uint16x8_t __s1_671 = __p1_671; \ + uint16x4_t __rev0_671; __rev0_671 = __builtin_shufflevector(__s0_671, __s0_671, 3, 2, 1, 0); \ + uint16x8_t __rev1_671; __rev1_671 = __builtin_shufflevector(__s1_671, __s1_671, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint32x4_t __ret_671; \ + __ret_671 = __noswap_vmull_u16(__rev0_671, __noswap_splat_laneq_u16(__rev1_671, __p2_671)); \ + __ret_671 = __builtin_shufflevector(__ret_671, __ret_671, 3, 2, 1, 0); \ + __ret_671; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmull_laneq_s32(__p0_580, __p1_580, __p2_580) __extension__ ({ \ - int32x2_t __s0_580 = __p0_580; \ - int32x4_t __s1_580 = __p1_580; \ - int64x2_t __ret_580; \ - __ret_580 = vmull_s32(__s0_580, splat_laneq_s32(__s1_580, __p2_580)); \ - __ret_580; \ +#define vmull_laneq_s32(__p0_672, __p1_672, __p2_672) __extension__ ({ \ + int32x2_t __s0_672 = __p0_672; \ + int32x4_t __s1_672 = __p1_672; \ + int64x2_t __ret_672; \ + __ret_672 = vmull_s32(__s0_672, splat_laneq_s32(__s1_672, __p2_672)); \ + __ret_672; \ }) #else -#define vmull_laneq_s32(__p0_581, __p1_581, __p2_581) __extension__ ({ \ - int32x2_t __s0_581 = __p0_581; \ - int32x4_t __s1_581 = __p1_581; \ - int32x2_t __rev0_581; __rev0_581 = __builtin_shufflevector(__s0_581, __s0_581, 1, 0); \ - int32x4_t __rev1_581; __rev1_581 = __builtin_shufflevector(__s1_581, __s1_581, 3, 2, 1, 0); \ - int64x2_t __ret_581; \ - __ret_581 = __noswap_vmull_s32(__rev0_581, __noswap_splat_laneq_s32(__rev1_581, __p2_581)); \ - __ret_581 = __builtin_shufflevector(__ret_581, __ret_581, 1, 0); \ - __ret_581; \ +#define vmull_laneq_s32(__p0_673, __p1_673, __p2_673) __extension__ ({ \ + int32x2_t __s0_673 = __p0_673; \ + int32x4_t __s1_673 = __p1_673; \ + int32x2_t __rev0_673; __rev0_673 = __builtin_shufflevector(__s0_673, __s0_673, 1, 0); \ + int32x4_t __rev1_673; __rev1_673 = __builtin_shufflevector(__s1_673, __s1_673, 3, 2, 1, 0); \ + int64x2_t __ret_673; \ + __ret_673 = __noswap_vmull_s32(__rev0_673, __noswap_splat_laneq_s32(__rev1_673, __p2_673)); \ + __ret_673 = __builtin_shufflevector(__ret_673, __ret_673, 1, 0); \ + __ret_673; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmull_laneq_s16(__p0_582, __p1_582, __p2_582) __extension__ ({ \ - int16x4_t __s0_582 = __p0_582; \ - int16x8_t __s1_582 = __p1_582; \ - int32x4_t __ret_582; \ - __ret_582 = vmull_s16(__s0_582, splat_laneq_s16(__s1_582, __p2_582)); \ - __ret_582; \ +#define vmull_laneq_s16(__p0_674, __p1_674, __p2_674) __extension__ ({ \ + int16x4_t __s0_674 = __p0_674; \ + int16x8_t __s1_674 = __p1_674; \ + int32x4_t 
__ret_674; \ + __ret_674 = vmull_s16(__s0_674, splat_laneq_s16(__s1_674, __p2_674)); \ + __ret_674; \ }) #else -#define vmull_laneq_s16(__p0_583, __p1_583, __p2_583) __extension__ ({ \ - int16x4_t __s0_583 = __p0_583; \ - int16x8_t __s1_583 = __p1_583; \ - int16x4_t __rev0_583; __rev0_583 = __builtin_shufflevector(__s0_583, __s0_583, 3, 2, 1, 0); \ - int16x8_t __rev1_583; __rev1_583 = __builtin_shufflevector(__s1_583, __s1_583, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32x4_t __ret_583; \ - __ret_583 = __noswap_vmull_s16(__rev0_583, __noswap_splat_laneq_s16(__rev1_583, __p2_583)); \ - __ret_583 = __builtin_shufflevector(__ret_583, __ret_583, 3, 2, 1, 0); \ - __ret_583; \ +#define vmull_laneq_s16(__p0_675, __p1_675, __p2_675) __extension__ ({ \ + int16x4_t __s0_675 = __p0_675; \ + int16x8_t __s1_675 = __p1_675; \ + int16x4_t __rev0_675; __rev0_675 = __builtin_shufflevector(__s0_675, __s0_675, 3, 2, 1, 0); \ + int16x8_t __rev1_675; __rev1_675 = __builtin_shufflevector(__s1_675, __s1_675, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x4_t __ret_675; \ + __ret_675 = __noswap_vmull_s16(__rev0_675, __noswap_splat_laneq_s16(__rev1_675, __p2_675)); \ + __ret_675 = __builtin_shufflevector(__ret_675, __ret_675, 3, 2, 1, 0); \ + __ret_675; \ }) #endif @@ -54867,192 +56537,192 @@ __ai float32_t vmulxs_f32(float32_t __p0, float32_t __p1) { __ret = (float32_t) __builtin_neon_vmulxs_f32(__p0, __p1); return __ret; } -#define vmulxd_lane_f64(__p0_584, __p1_584, __p2_584) __extension__ ({ \ - float64_t __s0_584 = __p0_584; \ - float64x1_t __s1_584 = __p1_584; \ - float64_t __ret_584; \ - __ret_584 = vmulxd_f64(__s0_584, vget_lane_f64(__s1_584, __p2_584)); \ - __ret_584; \ +#define vmulxd_lane_f64(__p0_676, __p1_676, __p2_676) __extension__ ({ \ + float64_t __s0_676 = __p0_676; \ + float64x1_t __s1_676 = __p1_676; \ + float64_t __ret_676; \ + __ret_676 = vmulxd_f64(__s0_676, vget_lane_f64(__s1_676, __p2_676)); \ + __ret_676; \ }) #ifdef __LITTLE_ENDIAN__ -#define vmulxs_lane_f32(__p0_585, __p1_585, __p2_585) __extension__ ({ \ - float32_t __s0_585 = __p0_585; \ - float32x2_t __s1_585 = __p1_585; \ - float32_t __ret_585; \ - __ret_585 = vmulxs_f32(__s0_585, vget_lane_f32(__s1_585, __p2_585)); \ - __ret_585; \ +#define vmulxs_lane_f32(__p0_677, __p1_677, __p2_677) __extension__ ({ \ + float32_t __s0_677 = __p0_677; \ + float32x2_t __s1_677 = __p1_677; \ + float32_t __ret_677; \ + __ret_677 = vmulxs_f32(__s0_677, vget_lane_f32(__s1_677, __p2_677)); \ + __ret_677; \ }) #else -#define vmulxs_lane_f32(__p0_586, __p1_586, __p2_586) __extension__ ({ \ - float32_t __s0_586 = __p0_586; \ - float32x2_t __s1_586 = __p1_586; \ - float32x2_t __rev1_586; __rev1_586 = __builtin_shufflevector(__s1_586, __s1_586, 1, 0); \ - float32_t __ret_586; \ - __ret_586 = vmulxs_f32(__s0_586, __noswap_vget_lane_f32(__rev1_586, __p2_586)); \ - __ret_586; \ +#define vmulxs_lane_f32(__p0_678, __p1_678, __p2_678) __extension__ ({ \ + float32_t __s0_678 = __p0_678; \ + float32x2_t __s1_678 = __p1_678; \ + float32x2_t __rev1_678; __rev1_678 = __builtin_shufflevector(__s1_678, __s1_678, 1, 0); \ + float32_t __ret_678; \ + __ret_678 = vmulxs_f32(__s0_678, __noswap_vget_lane_f32(__rev1_678, __p2_678)); \ + __ret_678; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmulxq_lane_f64(__p0_587, __p1_587, __p2_587) __extension__ ({ \ - float64x2_t __s0_587 = __p0_587; \ - float64x1_t __s1_587 = __p1_587; \ - float64x2_t __ret_587; \ - __ret_587 = vmulxq_f64(__s0_587, splatq_lane_f64(__s1_587, __p2_587)); \ - __ret_587; \ +#define vmulxq_lane_f64(__p0_679, __p1_679, __p2_679) 
__extension__ ({ \ + float64x2_t __s0_679 = __p0_679; \ + float64x1_t __s1_679 = __p1_679; \ + float64x2_t __ret_679; \ + __ret_679 = vmulxq_f64(__s0_679, splatq_lane_f64(__s1_679, __p2_679)); \ + __ret_679; \ }) #else -#define vmulxq_lane_f64(__p0_588, __p1_588, __p2_588) __extension__ ({ \ - float64x2_t __s0_588 = __p0_588; \ - float64x1_t __s1_588 = __p1_588; \ - float64x2_t __rev0_588; __rev0_588 = __builtin_shufflevector(__s0_588, __s0_588, 1, 0); \ - float64x2_t __ret_588; \ - __ret_588 = __noswap_vmulxq_f64(__rev0_588, __noswap_splatq_lane_f64(__s1_588, __p2_588)); \ - __ret_588 = __builtin_shufflevector(__ret_588, __ret_588, 1, 0); \ - __ret_588; \ +#define vmulxq_lane_f64(__p0_680, __p1_680, __p2_680) __extension__ ({ \ + float64x2_t __s0_680 = __p0_680; \ + float64x1_t __s1_680 = __p1_680; \ + float64x2_t __rev0_680; __rev0_680 = __builtin_shufflevector(__s0_680, __s0_680, 1, 0); \ + float64x2_t __ret_680; \ + __ret_680 = __noswap_vmulxq_f64(__rev0_680, __noswap_splatq_lane_f64(__s1_680, __p2_680)); \ + __ret_680 = __builtin_shufflevector(__ret_680, __ret_680, 1, 0); \ + __ret_680; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmulxq_lane_f32(__p0_589, __p1_589, __p2_589) __extension__ ({ \ - float32x4_t __s0_589 = __p0_589; \ - float32x2_t __s1_589 = __p1_589; \ - float32x4_t __ret_589; \ - __ret_589 = vmulxq_f32(__s0_589, splatq_lane_f32(__s1_589, __p2_589)); \ - __ret_589; \ +#define vmulxq_lane_f32(__p0_681, __p1_681, __p2_681) __extension__ ({ \ + float32x4_t __s0_681 = __p0_681; \ + float32x2_t __s1_681 = __p1_681; \ + float32x4_t __ret_681; \ + __ret_681 = vmulxq_f32(__s0_681, splatq_lane_f32(__s1_681, __p2_681)); \ + __ret_681; \ }) #else -#define vmulxq_lane_f32(__p0_590, __p1_590, __p2_590) __extension__ ({ \ - float32x4_t __s0_590 = __p0_590; \ - float32x2_t __s1_590 = __p1_590; \ - float32x4_t __rev0_590; __rev0_590 = __builtin_shufflevector(__s0_590, __s0_590, 3, 2, 1, 0); \ - float32x2_t __rev1_590; __rev1_590 = __builtin_shufflevector(__s1_590, __s1_590, 1, 0); \ - float32x4_t __ret_590; \ - __ret_590 = __noswap_vmulxq_f32(__rev0_590, __noswap_splatq_lane_f32(__rev1_590, __p2_590)); \ - __ret_590 = __builtin_shufflevector(__ret_590, __ret_590, 3, 2, 1, 0); \ - __ret_590; \ +#define vmulxq_lane_f32(__p0_682, __p1_682, __p2_682) __extension__ ({ \ + float32x4_t __s0_682 = __p0_682; \ + float32x2_t __s1_682 = __p1_682; \ + float32x4_t __rev0_682; __rev0_682 = __builtin_shufflevector(__s0_682, __s0_682, 3, 2, 1, 0); \ + float32x2_t __rev1_682; __rev1_682 = __builtin_shufflevector(__s1_682, __s1_682, 1, 0); \ + float32x4_t __ret_682; \ + __ret_682 = __noswap_vmulxq_f32(__rev0_682, __noswap_splatq_lane_f32(__rev1_682, __p2_682)); \ + __ret_682 = __builtin_shufflevector(__ret_682, __ret_682, 3, 2, 1, 0); \ + __ret_682; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmulx_lane_f32(__p0_591, __p1_591, __p2_591) __extension__ ({ \ - float32x2_t __s0_591 = __p0_591; \ - float32x2_t __s1_591 = __p1_591; \ - float32x2_t __ret_591; \ - __ret_591 = vmulx_f32(__s0_591, splat_lane_f32(__s1_591, __p2_591)); \ - __ret_591; \ +#define vmulx_lane_f32(__p0_683, __p1_683, __p2_683) __extension__ ({ \ + float32x2_t __s0_683 = __p0_683; \ + float32x2_t __s1_683 = __p1_683; \ + float32x2_t __ret_683; \ + __ret_683 = vmulx_f32(__s0_683, splat_lane_f32(__s1_683, __p2_683)); \ + __ret_683; \ }) #else -#define vmulx_lane_f32(__p0_592, __p1_592, __p2_592) __extension__ ({ \ - float32x2_t __s0_592 = __p0_592; \ - float32x2_t __s1_592 = __p1_592; \ - float32x2_t __rev0_592; __rev0_592 = 
__builtin_shufflevector(__s0_592, __s0_592, 1, 0); \ - float32x2_t __rev1_592; __rev1_592 = __builtin_shufflevector(__s1_592, __s1_592, 1, 0); \ - float32x2_t __ret_592; \ - __ret_592 = __noswap_vmulx_f32(__rev0_592, __noswap_splat_lane_f32(__rev1_592, __p2_592)); \ - __ret_592 = __builtin_shufflevector(__ret_592, __ret_592, 1, 0); \ - __ret_592; \ +#define vmulx_lane_f32(__p0_684, __p1_684, __p2_684) __extension__ ({ \ + float32x2_t __s0_684 = __p0_684; \ + float32x2_t __s1_684 = __p1_684; \ + float32x2_t __rev0_684; __rev0_684 = __builtin_shufflevector(__s0_684, __s0_684, 1, 0); \ + float32x2_t __rev1_684; __rev1_684 = __builtin_shufflevector(__s1_684, __s1_684, 1, 0); \ + float32x2_t __ret_684; \ + __ret_684 = __noswap_vmulx_f32(__rev0_684, __noswap_splat_lane_f32(__rev1_684, __p2_684)); \ + __ret_684 = __builtin_shufflevector(__ret_684, __ret_684, 1, 0); \ + __ret_684; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmulxd_laneq_f64(__p0_593, __p1_593, __p2_593) __extension__ ({ \ - float64_t __s0_593 = __p0_593; \ - float64x2_t __s1_593 = __p1_593; \ - float64_t __ret_593; \ - __ret_593 = vmulxd_f64(__s0_593, vgetq_lane_f64(__s1_593, __p2_593)); \ - __ret_593; \ +#define vmulxd_laneq_f64(__p0_685, __p1_685, __p2_685) __extension__ ({ \ + float64_t __s0_685 = __p0_685; \ + float64x2_t __s1_685 = __p1_685; \ + float64_t __ret_685; \ + __ret_685 = vmulxd_f64(__s0_685, vgetq_lane_f64(__s1_685, __p2_685)); \ + __ret_685; \ }) #else -#define vmulxd_laneq_f64(__p0_594, __p1_594, __p2_594) __extension__ ({ \ - float64_t __s0_594 = __p0_594; \ - float64x2_t __s1_594 = __p1_594; \ - float64x2_t __rev1_594; __rev1_594 = __builtin_shufflevector(__s1_594, __s1_594, 1, 0); \ - float64_t __ret_594; \ - __ret_594 = vmulxd_f64(__s0_594, __noswap_vgetq_lane_f64(__rev1_594, __p2_594)); \ - __ret_594; \ +#define vmulxd_laneq_f64(__p0_686, __p1_686, __p2_686) __extension__ ({ \ + float64_t __s0_686 = __p0_686; \ + float64x2_t __s1_686 = __p1_686; \ + float64x2_t __rev1_686; __rev1_686 = __builtin_shufflevector(__s1_686, __s1_686, 1, 0); \ + float64_t __ret_686; \ + __ret_686 = vmulxd_f64(__s0_686, __noswap_vgetq_lane_f64(__rev1_686, __p2_686)); \ + __ret_686; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmulxs_laneq_f32(__p0_595, __p1_595, __p2_595) __extension__ ({ \ - float32_t __s0_595 = __p0_595; \ - float32x4_t __s1_595 = __p1_595; \ - float32_t __ret_595; \ - __ret_595 = vmulxs_f32(__s0_595, vgetq_lane_f32(__s1_595, __p2_595)); \ - __ret_595; \ +#define vmulxs_laneq_f32(__p0_687, __p1_687, __p2_687) __extension__ ({ \ + float32_t __s0_687 = __p0_687; \ + float32x4_t __s1_687 = __p1_687; \ + float32_t __ret_687; \ + __ret_687 = vmulxs_f32(__s0_687, vgetq_lane_f32(__s1_687, __p2_687)); \ + __ret_687; \ }) #else -#define vmulxs_laneq_f32(__p0_596, __p1_596, __p2_596) __extension__ ({ \ - float32_t __s0_596 = __p0_596; \ - float32x4_t __s1_596 = __p1_596; \ - float32x4_t __rev1_596; __rev1_596 = __builtin_shufflevector(__s1_596, __s1_596, 3, 2, 1, 0); \ - float32_t __ret_596; \ - __ret_596 = vmulxs_f32(__s0_596, __noswap_vgetq_lane_f32(__rev1_596, __p2_596)); \ - __ret_596; \ +#define vmulxs_laneq_f32(__p0_688, __p1_688, __p2_688) __extension__ ({ \ + float32_t __s0_688 = __p0_688; \ + float32x4_t __s1_688 = __p1_688; \ + float32x4_t __rev1_688; __rev1_688 = __builtin_shufflevector(__s1_688, __s1_688, 3, 2, 1, 0); \ + float32_t __ret_688; \ + __ret_688 = vmulxs_f32(__s0_688, __noswap_vgetq_lane_f32(__rev1_688, __p2_688)); \ + __ret_688; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define 
vmulxq_laneq_f64(__p0_597, __p1_597, __p2_597) __extension__ ({ \ - float64x2_t __s0_597 = __p0_597; \ - float64x2_t __s1_597 = __p1_597; \ - float64x2_t __ret_597; \ - __ret_597 = vmulxq_f64(__s0_597, splatq_laneq_f64(__s1_597, __p2_597)); \ - __ret_597; \ +#define vmulxq_laneq_f64(__p0_689, __p1_689, __p2_689) __extension__ ({ \ + float64x2_t __s0_689 = __p0_689; \ + float64x2_t __s1_689 = __p1_689; \ + float64x2_t __ret_689; \ + __ret_689 = vmulxq_f64(__s0_689, splatq_laneq_f64(__s1_689, __p2_689)); \ + __ret_689; \ }) #else -#define vmulxq_laneq_f64(__p0_598, __p1_598, __p2_598) __extension__ ({ \ - float64x2_t __s0_598 = __p0_598; \ - float64x2_t __s1_598 = __p1_598; \ - float64x2_t __rev0_598; __rev0_598 = __builtin_shufflevector(__s0_598, __s0_598, 1, 0); \ - float64x2_t __rev1_598; __rev1_598 = __builtin_shufflevector(__s1_598, __s1_598, 1, 0); \ - float64x2_t __ret_598; \ - __ret_598 = __noswap_vmulxq_f64(__rev0_598, __noswap_splatq_laneq_f64(__rev1_598, __p2_598)); \ - __ret_598 = __builtin_shufflevector(__ret_598, __ret_598, 1, 0); \ - __ret_598; \ +#define vmulxq_laneq_f64(__p0_690, __p1_690, __p2_690) __extension__ ({ \ + float64x2_t __s0_690 = __p0_690; \ + float64x2_t __s1_690 = __p1_690; \ + float64x2_t __rev0_690; __rev0_690 = __builtin_shufflevector(__s0_690, __s0_690, 1, 0); \ + float64x2_t __rev1_690; __rev1_690 = __builtin_shufflevector(__s1_690, __s1_690, 1, 0); \ + float64x2_t __ret_690; \ + __ret_690 = __noswap_vmulxq_f64(__rev0_690, __noswap_splatq_laneq_f64(__rev1_690, __p2_690)); \ + __ret_690 = __builtin_shufflevector(__ret_690, __ret_690, 1, 0); \ + __ret_690; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmulxq_laneq_f32(__p0_599, __p1_599, __p2_599) __extension__ ({ \ - float32x4_t __s0_599 = __p0_599; \ - float32x4_t __s1_599 = __p1_599; \ - float32x4_t __ret_599; \ - __ret_599 = vmulxq_f32(__s0_599, splatq_laneq_f32(__s1_599, __p2_599)); \ - __ret_599; \ +#define vmulxq_laneq_f32(__p0_691, __p1_691, __p2_691) __extension__ ({ \ + float32x4_t __s0_691 = __p0_691; \ + float32x4_t __s1_691 = __p1_691; \ + float32x4_t __ret_691; \ + __ret_691 = vmulxq_f32(__s0_691, splatq_laneq_f32(__s1_691, __p2_691)); \ + __ret_691; \ }) #else -#define vmulxq_laneq_f32(__p0_600, __p1_600, __p2_600) __extension__ ({ \ - float32x4_t __s0_600 = __p0_600; \ - float32x4_t __s1_600 = __p1_600; \ - float32x4_t __rev0_600; __rev0_600 = __builtin_shufflevector(__s0_600, __s0_600, 3, 2, 1, 0); \ - float32x4_t __rev1_600; __rev1_600 = __builtin_shufflevector(__s1_600, __s1_600, 3, 2, 1, 0); \ - float32x4_t __ret_600; \ - __ret_600 = __noswap_vmulxq_f32(__rev0_600, __noswap_splatq_laneq_f32(__rev1_600, __p2_600)); \ - __ret_600 = __builtin_shufflevector(__ret_600, __ret_600, 3, 2, 1, 0); \ - __ret_600; \ +#define vmulxq_laneq_f32(__p0_692, __p1_692, __p2_692) __extension__ ({ \ + float32x4_t __s0_692 = __p0_692; \ + float32x4_t __s1_692 = __p1_692; \ + float32x4_t __rev0_692; __rev0_692 = __builtin_shufflevector(__s0_692, __s0_692, 3, 2, 1, 0); \ + float32x4_t __rev1_692; __rev1_692 = __builtin_shufflevector(__s1_692, __s1_692, 3, 2, 1, 0); \ + float32x4_t __ret_692; \ + __ret_692 = __noswap_vmulxq_f32(__rev0_692, __noswap_splatq_laneq_f32(__rev1_692, __p2_692)); \ + __ret_692 = __builtin_shufflevector(__ret_692, __ret_692, 3, 2, 1, 0); \ + __ret_692; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmulx_laneq_f32(__p0_601, __p1_601, __p2_601) __extension__ ({ \ - float32x2_t __s0_601 = __p0_601; \ - float32x4_t __s1_601 = __p1_601; \ - float32x2_t __ret_601; \ - __ret_601 = 
vmulx_f32(__s0_601, splat_laneq_f32(__s1_601, __p2_601)); \ - __ret_601; \ +#define vmulx_laneq_f32(__p0_693, __p1_693, __p2_693) __extension__ ({ \ + float32x2_t __s0_693 = __p0_693; \ + float32x4_t __s1_693 = __p1_693; \ + float32x2_t __ret_693; \ + __ret_693 = vmulx_f32(__s0_693, splat_laneq_f32(__s1_693, __p2_693)); \ + __ret_693; \ }) #else -#define vmulx_laneq_f32(__p0_602, __p1_602, __p2_602) __extension__ ({ \ - float32x2_t __s0_602 = __p0_602; \ - float32x4_t __s1_602 = __p1_602; \ - float32x2_t __rev0_602; __rev0_602 = __builtin_shufflevector(__s0_602, __s0_602, 1, 0); \ - float32x4_t __rev1_602; __rev1_602 = __builtin_shufflevector(__s1_602, __s1_602, 3, 2, 1, 0); \ - float32x2_t __ret_602; \ - __ret_602 = __noswap_vmulx_f32(__rev0_602, __noswap_splat_laneq_f32(__rev1_602, __p2_602)); \ - __ret_602 = __builtin_shufflevector(__ret_602, __ret_602, 1, 0); \ - __ret_602; \ +#define vmulx_laneq_f32(__p0_694, __p1_694, __p2_694) __extension__ ({ \ + float32x2_t __s0_694 = __p0_694; \ + float32x4_t __s1_694 = __p1_694; \ + float32x2_t __rev0_694; __rev0_694 = __builtin_shufflevector(__s0_694, __s0_694, 1, 0); \ + float32x4_t __rev1_694; __rev1_694 = __builtin_shufflevector(__s1_694, __s1_694, 3, 2, 1, 0); \ + float32x2_t __ret_694; \ + __ret_694 = __noswap_vmulx_f32(__rev0_694, __noswap_splat_laneq_f32(__rev1_694, __p2_694)); \ + __ret_694 = __builtin_shufflevector(__ret_694, __ret_694, 1, 0); \ + __ret_694; \ }) #endif @@ -55955,98 +57625,98 @@ __ai int32x4_t vqdmlal_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmlal_high_lane_s32(__p0_603, __p1_603, __p2_603, __p3_603) __extension__ ({ \ - int64x2_t __s0_603 = __p0_603; \ - int32x4_t __s1_603 = __p1_603; \ - int32x2_t __s2_603 = __p2_603; \ - int64x2_t __ret_603; \ - __ret_603 = vqdmlal_s32(__s0_603, vget_high_s32(__s1_603), splat_lane_s32(__s2_603, __p3_603)); \ - __ret_603; \ +#define vqdmlal_high_lane_s32(__p0_695, __p1_695, __p2_695, __p3_695) __extension__ ({ \ + int64x2_t __s0_695 = __p0_695; \ + int32x4_t __s1_695 = __p1_695; \ + int32x2_t __s2_695 = __p2_695; \ + int64x2_t __ret_695; \ + __ret_695 = vqdmlal_s32(__s0_695, vget_high_s32(__s1_695), splat_lane_s32(__s2_695, __p3_695)); \ + __ret_695; \ }) #else -#define vqdmlal_high_lane_s32(__p0_604, __p1_604, __p2_604, __p3_604) __extension__ ({ \ - int64x2_t __s0_604 = __p0_604; \ - int32x4_t __s1_604 = __p1_604; \ - int32x2_t __s2_604 = __p2_604; \ - int64x2_t __rev0_604; __rev0_604 = __builtin_shufflevector(__s0_604, __s0_604, 1, 0); \ - int32x4_t __rev1_604; __rev1_604 = __builtin_shufflevector(__s1_604, __s1_604, 3, 2, 1, 0); \ - int32x2_t __rev2_604; __rev2_604 = __builtin_shufflevector(__s2_604, __s2_604, 1, 0); \ - int64x2_t __ret_604; \ - __ret_604 = __noswap_vqdmlal_s32(__rev0_604, __noswap_vget_high_s32(__rev1_604), __noswap_splat_lane_s32(__rev2_604, __p3_604)); \ - __ret_604 = __builtin_shufflevector(__ret_604, __ret_604, 1, 0); \ - __ret_604; \ +#define vqdmlal_high_lane_s32(__p0_696, __p1_696, __p2_696, __p3_696) __extension__ ({ \ + int64x2_t __s0_696 = __p0_696; \ + int32x4_t __s1_696 = __p1_696; \ + int32x2_t __s2_696 = __p2_696; \ + int64x2_t __rev0_696; __rev0_696 = __builtin_shufflevector(__s0_696, __s0_696, 1, 0); \ + int32x4_t __rev1_696; __rev1_696 = __builtin_shufflevector(__s1_696, __s1_696, 3, 2, 1, 0); \ + int32x2_t __rev2_696; __rev2_696 = __builtin_shufflevector(__s2_696, __s2_696, 1, 0); \ + int64x2_t __ret_696; \ + __ret_696 = __noswap_vqdmlal_s32(__rev0_696, 
__noswap_vget_high_s32(__rev1_696), __noswap_splat_lane_s32(__rev2_696, __p3_696)); \ + __ret_696 = __builtin_shufflevector(__ret_696, __ret_696, 1, 0); \ + __ret_696; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmlal_high_lane_s16(__p0_605, __p1_605, __p2_605, __p3_605) __extension__ ({ \ - int32x4_t __s0_605 = __p0_605; \ - int16x8_t __s1_605 = __p1_605; \ - int16x4_t __s2_605 = __p2_605; \ - int32x4_t __ret_605; \ - __ret_605 = vqdmlal_s16(__s0_605, vget_high_s16(__s1_605), splat_lane_s16(__s2_605, __p3_605)); \ - __ret_605; \ +#define vqdmlal_high_lane_s16(__p0_697, __p1_697, __p2_697, __p3_697) __extension__ ({ \ + int32x4_t __s0_697 = __p0_697; \ + int16x8_t __s1_697 = __p1_697; \ + int16x4_t __s2_697 = __p2_697; \ + int32x4_t __ret_697; \ + __ret_697 = vqdmlal_s16(__s0_697, vget_high_s16(__s1_697), splat_lane_s16(__s2_697, __p3_697)); \ + __ret_697; \ }) #else -#define vqdmlal_high_lane_s16(__p0_606, __p1_606, __p2_606, __p3_606) __extension__ ({ \ - int32x4_t __s0_606 = __p0_606; \ - int16x8_t __s1_606 = __p1_606; \ - int16x4_t __s2_606 = __p2_606; \ - int32x4_t __rev0_606; __rev0_606 = __builtin_shufflevector(__s0_606, __s0_606, 3, 2, 1, 0); \ - int16x8_t __rev1_606; __rev1_606 = __builtin_shufflevector(__s1_606, __s1_606, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x4_t __rev2_606; __rev2_606 = __builtin_shufflevector(__s2_606, __s2_606, 3, 2, 1, 0); \ - int32x4_t __ret_606; \ - __ret_606 = __noswap_vqdmlal_s16(__rev0_606, __noswap_vget_high_s16(__rev1_606), __noswap_splat_lane_s16(__rev2_606, __p3_606)); \ - __ret_606 = __builtin_shufflevector(__ret_606, __ret_606, 3, 2, 1, 0); \ - __ret_606; \ +#define vqdmlal_high_lane_s16(__p0_698, __p1_698, __p2_698, __p3_698) __extension__ ({ \ + int32x4_t __s0_698 = __p0_698; \ + int16x8_t __s1_698 = __p1_698; \ + int16x4_t __s2_698 = __p2_698; \ + int32x4_t __rev0_698; __rev0_698 = __builtin_shufflevector(__s0_698, __s0_698, 3, 2, 1, 0); \ + int16x8_t __rev1_698; __rev1_698 = __builtin_shufflevector(__s1_698, __s1_698, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x4_t __rev2_698; __rev2_698 = __builtin_shufflevector(__s2_698, __s2_698, 3, 2, 1, 0); \ + int32x4_t __ret_698; \ + __ret_698 = __noswap_vqdmlal_s16(__rev0_698, __noswap_vget_high_s16(__rev1_698), __noswap_splat_lane_s16(__rev2_698, __p3_698)); \ + __ret_698 = __builtin_shufflevector(__ret_698, __ret_698, 3, 2, 1, 0); \ + __ret_698; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmlal_high_laneq_s32(__p0_607, __p1_607, __p2_607, __p3_607) __extension__ ({ \ - int64x2_t __s0_607 = __p0_607; \ - int32x4_t __s1_607 = __p1_607; \ - int32x4_t __s2_607 = __p2_607; \ - int64x2_t __ret_607; \ - __ret_607 = vqdmlal_s32(__s0_607, vget_high_s32(__s1_607), splat_laneq_s32(__s2_607, __p3_607)); \ - __ret_607; \ +#define vqdmlal_high_laneq_s32(__p0_699, __p1_699, __p2_699, __p3_699) __extension__ ({ \ + int64x2_t __s0_699 = __p0_699; \ + int32x4_t __s1_699 = __p1_699; \ + int32x4_t __s2_699 = __p2_699; \ + int64x2_t __ret_699; \ + __ret_699 = vqdmlal_s32(__s0_699, vget_high_s32(__s1_699), splat_laneq_s32(__s2_699, __p3_699)); \ + __ret_699; \ }) #else -#define vqdmlal_high_laneq_s32(__p0_608, __p1_608, __p2_608, __p3_608) __extension__ ({ \ - int64x2_t __s0_608 = __p0_608; \ - int32x4_t __s1_608 = __p1_608; \ - int32x4_t __s2_608 = __p2_608; \ - int64x2_t __rev0_608; __rev0_608 = __builtin_shufflevector(__s0_608, __s0_608, 1, 0); \ - int32x4_t __rev1_608; __rev1_608 = __builtin_shufflevector(__s1_608, __s1_608, 3, 2, 1, 0); \ - int32x4_t __rev2_608; __rev2_608 = __builtin_shufflevector(__s2_608, __s2_608, 3, 
2, 1, 0); \ - int64x2_t __ret_608; \ - __ret_608 = __noswap_vqdmlal_s32(__rev0_608, __noswap_vget_high_s32(__rev1_608), __noswap_splat_laneq_s32(__rev2_608, __p3_608)); \ - __ret_608 = __builtin_shufflevector(__ret_608, __ret_608, 1, 0); \ - __ret_608; \ +#define vqdmlal_high_laneq_s32(__p0_700, __p1_700, __p2_700, __p3_700) __extension__ ({ \ + int64x2_t __s0_700 = __p0_700; \ + int32x4_t __s1_700 = __p1_700; \ + int32x4_t __s2_700 = __p2_700; \ + int64x2_t __rev0_700; __rev0_700 = __builtin_shufflevector(__s0_700, __s0_700, 1, 0); \ + int32x4_t __rev1_700; __rev1_700 = __builtin_shufflevector(__s1_700, __s1_700, 3, 2, 1, 0); \ + int32x4_t __rev2_700; __rev2_700 = __builtin_shufflevector(__s2_700, __s2_700, 3, 2, 1, 0); \ + int64x2_t __ret_700; \ + __ret_700 = __noswap_vqdmlal_s32(__rev0_700, __noswap_vget_high_s32(__rev1_700), __noswap_splat_laneq_s32(__rev2_700, __p3_700)); \ + __ret_700 = __builtin_shufflevector(__ret_700, __ret_700, 1, 0); \ + __ret_700; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmlal_high_laneq_s16(__p0_609, __p1_609, __p2_609, __p3_609) __extension__ ({ \ - int32x4_t __s0_609 = __p0_609; \ - int16x8_t __s1_609 = __p1_609; \ - int16x8_t __s2_609 = __p2_609; \ - int32x4_t __ret_609; \ - __ret_609 = vqdmlal_s16(__s0_609, vget_high_s16(__s1_609), splat_laneq_s16(__s2_609, __p3_609)); \ - __ret_609; \ +#define vqdmlal_high_laneq_s16(__p0_701, __p1_701, __p2_701, __p3_701) __extension__ ({ \ + int32x4_t __s0_701 = __p0_701; \ + int16x8_t __s1_701 = __p1_701; \ + int16x8_t __s2_701 = __p2_701; \ + int32x4_t __ret_701; \ + __ret_701 = vqdmlal_s16(__s0_701, vget_high_s16(__s1_701), splat_laneq_s16(__s2_701, __p3_701)); \ + __ret_701; \ }) #else -#define vqdmlal_high_laneq_s16(__p0_610, __p1_610, __p2_610, __p3_610) __extension__ ({ \ - int32x4_t __s0_610 = __p0_610; \ - int16x8_t __s1_610 = __p1_610; \ - int16x8_t __s2_610 = __p2_610; \ - int32x4_t __rev0_610; __rev0_610 = __builtin_shufflevector(__s0_610, __s0_610, 3, 2, 1, 0); \ - int16x8_t __rev1_610; __rev1_610 = __builtin_shufflevector(__s1_610, __s1_610, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev2_610; __rev2_610 = __builtin_shufflevector(__s2_610, __s2_610, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32x4_t __ret_610; \ - __ret_610 = __noswap_vqdmlal_s16(__rev0_610, __noswap_vget_high_s16(__rev1_610), __noswap_splat_laneq_s16(__rev2_610, __p3_610)); \ - __ret_610 = __builtin_shufflevector(__ret_610, __ret_610, 3, 2, 1, 0); \ - __ret_610; \ +#define vqdmlal_high_laneq_s16(__p0_702, __p1_702, __p2_702, __p3_702) __extension__ ({ \ + int32x4_t __s0_702 = __p0_702; \ + int16x8_t __s1_702 = __p1_702; \ + int16x8_t __s2_702 = __p2_702; \ + int32x4_t __rev0_702; __rev0_702 = __builtin_shufflevector(__s0_702, __s0_702, 3, 2, 1, 0); \ + int16x8_t __rev1_702; __rev1_702 = __builtin_shufflevector(__s1_702, __s1_702, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev2_702; __rev2_702 = __builtin_shufflevector(__s2_702, __s2_702, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x4_t __ret_702; \ + __ret_702 = __noswap_vqdmlal_s16(__rev0_702, __noswap_vget_high_s16(__rev1_702), __noswap_splat_laneq_s16(__rev2_702, __p3_702)); \ + __ret_702 = __builtin_shufflevector(__ret_702, __ret_702, 3, 2, 1, 0); \ + __ret_702; \ }) #endif @@ -56169,50 +57839,50 @@ __ai int32x4_t vqdmlal_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmlal_laneq_s32(__p0_611, __p1_611, __p2_611, __p3_611) __extension__ ({ \ - int64x2_t __s0_611 = __p0_611; \ - int32x2_t __s1_611 = __p1_611; \ - int32x4_t __s2_611 = __p2_611; \ - 
int64x2_t __ret_611; \ - __ret_611 = vqdmlal_s32(__s0_611, __s1_611, splat_laneq_s32(__s2_611, __p3_611)); \ - __ret_611; \ +#define vqdmlal_laneq_s32(__p0_703, __p1_703, __p2_703, __p3_703) __extension__ ({ \ + int64x2_t __s0_703 = __p0_703; \ + int32x2_t __s1_703 = __p1_703; \ + int32x4_t __s2_703 = __p2_703; \ + int64x2_t __ret_703; \ + __ret_703 = vqdmlal_s32(__s0_703, __s1_703, splat_laneq_s32(__s2_703, __p3_703)); \ + __ret_703; \ }) #else -#define vqdmlal_laneq_s32(__p0_612, __p1_612, __p2_612, __p3_612) __extension__ ({ \ - int64x2_t __s0_612 = __p0_612; \ - int32x2_t __s1_612 = __p1_612; \ - int32x4_t __s2_612 = __p2_612; \ - int64x2_t __rev0_612; __rev0_612 = __builtin_shufflevector(__s0_612, __s0_612, 1, 0); \ - int32x2_t __rev1_612; __rev1_612 = __builtin_shufflevector(__s1_612, __s1_612, 1, 0); \ - int32x4_t __rev2_612; __rev2_612 = __builtin_shufflevector(__s2_612, __s2_612, 3, 2, 1, 0); \ - int64x2_t __ret_612; \ - __ret_612 = __noswap_vqdmlal_s32(__rev0_612, __rev1_612, __noswap_splat_laneq_s32(__rev2_612, __p3_612)); \ - __ret_612 = __builtin_shufflevector(__ret_612, __ret_612, 1, 0); \ - __ret_612; \ +#define vqdmlal_laneq_s32(__p0_704, __p1_704, __p2_704, __p3_704) __extension__ ({ \ + int64x2_t __s0_704 = __p0_704; \ + int32x2_t __s1_704 = __p1_704; \ + int32x4_t __s2_704 = __p2_704; \ + int64x2_t __rev0_704; __rev0_704 = __builtin_shufflevector(__s0_704, __s0_704, 1, 0); \ + int32x2_t __rev1_704; __rev1_704 = __builtin_shufflevector(__s1_704, __s1_704, 1, 0); \ + int32x4_t __rev2_704; __rev2_704 = __builtin_shufflevector(__s2_704, __s2_704, 3, 2, 1, 0); \ + int64x2_t __ret_704; \ + __ret_704 = __noswap_vqdmlal_s32(__rev0_704, __rev1_704, __noswap_splat_laneq_s32(__rev2_704, __p3_704)); \ + __ret_704 = __builtin_shufflevector(__ret_704, __ret_704, 1, 0); \ + __ret_704; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmlal_laneq_s16(__p0_613, __p1_613, __p2_613, __p3_613) __extension__ ({ \ - int32x4_t __s0_613 = __p0_613; \ - int16x4_t __s1_613 = __p1_613; \ - int16x8_t __s2_613 = __p2_613; \ - int32x4_t __ret_613; \ - __ret_613 = vqdmlal_s16(__s0_613, __s1_613, splat_laneq_s16(__s2_613, __p3_613)); \ - __ret_613; \ +#define vqdmlal_laneq_s16(__p0_705, __p1_705, __p2_705, __p3_705) __extension__ ({ \ + int32x4_t __s0_705 = __p0_705; \ + int16x4_t __s1_705 = __p1_705; \ + int16x8_t __s2_705 = __p2_705; \ + int32x4_t __ret_705; \ + __ret_705 = vqdmlal_s16(__s0_705, __s1_705, splat_laneq_s16(__s2_705, __p3_705)); \ + __ret_705; \ }) #else -#define vqdmlal_laneq_s16(__p0_614, __p1_614, __p2_614, __p3_614) __extension__ ({ \ - int32x4_t __s0_614 = __p0_614; \ - int16x4_t __s1_614 = __p1_614; \ - int16x8_t __s2_614 = __p2_614; \ - int32x4_t __rev0_614; __rev0_614 = __builtin_shufflevector(__s0_614, __s0_614, 3, 2, 1, 0); \ - int16x4_t __rev1_614; __rev1_614 = __builtin_shufflevector(__s1_614, __s1_614, 3, 2, 1, 0); \ - int16x8_t __rev2_614; __rev2_614 = __builtin_shufflevector(__s2_614, __s2_614, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32x4_t __ret_614; \ - __ret_614 = __noswap_vqdmlal_s16(__rev0_614, __rev1_614, __noswap_splat_laneq_s16(__rev2_614, __p3_614)); \ - __ret_614 = __builtin_shufflevector(__ret_614, __ret_614, 3, 2, 1, 0); \ - __ret_614; \ +#define vqdmlal_laneq_s16(__p0_706, __p1_706, __p2_706, __p3_706) __extension__ ({ \ + int32x4_t __s0_706 = __p0_706; \ + int16x4_t __s1_706 = __p1_706; \ + int16x8_t __s2_706 = __p2_706; \ + int32x4_t __rev0_706; __rev0_706 = __builtin_shufflevector(__s0_706, __s0_706, 3, 2, 1, 0); \ + int16x4_t __rev1_706; __rev1_706 = 
__builtin_shufflevector(__s1_706, __s1_706, 3, 2, 1, 0); \ + int16x8_t __rev2_706; __rev2_706 = __builtin_shufflevector(__s2_706, __s2_706, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x4_t __ret_706; \ + __ret_706 = __noswap_vqdmlal_s16(__rev0_706, __rev1_706, __noswap_splat_laneq_s16(__rev2_706, __p3_706)); \ + __ret_706 = __builtin_shufflevector(__ret_706, __ret_706, 3, 2, 1, 0); \ + __ret_706; \ }) #endif @@ -56263,98 +57933,98 @@ __ai int32x4_t vqdmlsl_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmlsl_high_lane_s32(__p0_615, __p1_615, __p2_615, __p3_615) __extension__ ({ \ - int64x2_t __s0_615 = __p0_615; \ - int32x4_t __s1_615 = __p1_615; \ - int32x2_t __s2_615 = __p2_615; \ - int64x2_t __ret_615; \ - __ret_615 = vqdmlsl_s32(__s0_615, vget_high_s32(__s1_615), splat_lane_s32(__s2_615, __p3_615)); \ - __ret_615; \ +#define vqdmlsl_high_lane_s32(__p0_707, __p1_707, __p2_707, __p3_707) __extension__ ({ \ + int64x2_t __s0_707 = __p0_707; \ + int32x4_t __s1_707 = __p1_707; \ + int32x2_t __s2_707 = __p2_707; \ + int64x2_t __ret_707; \ + __ret_707 = vqdmlsl_s32(__s0_707, vget_high_s32(__s1_707), splat_lane_s32(__s2_707, __p3_707)); \ + __ret_707; \ }) #else -#define vqdmlsl_high_lane_s32(__p0_616, __p1_616, __p2_616, __p3_616) __extension__ ({ \ - int64x2_t __s0_616 = __p0_616; \ - int32x4_t __s1_616 = __p1_616; \ - int32x2_t __s2_616 = __p2_616; \ - int64x2_t __rev0_616; __rev0_616 = __builtin_shufflevector(__s0_616, __s0_616, 1, 0); \ - int32x4_t __rev1_616; __rev1_616 = __builtin_shufflevector(__s1_616, __s1_616, 3, 2, 1, 0); \ - int32x2_t __rev2_616; __rev2_616 = __builtin_shufflevector(__s2_616, __s2_616, 1, 0); \ - int64x2_t __ret_616; \ - __ret_616 = __noswap_vqdmlsl_s32(__rev0_616, __noswap_vget_high_s32(__rev1_616), __noswap_splat_lane_s32(__rev2_616, __p3_616)); \ - __ret_616 = __builtin_shufflevector(__ret_616, __ret_616, 1, 0); \ - __ret_616; \ +#define vqdmlsl_high_lane_s32(__p0_708, __p1_708, __p2_708, __p3_708) __extension__ ({ \ + int64x2_t __s0_708 = __p0_708; \ + int32x4_t __s1_708 = __p1_708; \ + int32x2_t __s2_708 = __p2_708; \ + int64x2_t __rev0_708; __rev0_708 = __builtin_shufflevector(__s0_708, __s0_708, 1, 0); \ + int32x4_t __rev1_708; __rev1_708 = __builtin_shufflevector(__s1_708, __s1_708, 3, 2, 1, 0); \ + int32x2_t __rev2_708; __rev2_708 = __builtin_shufflevector(__s2_708, __s2_708, 1, 0); \ + int64x2_t __ret_708; \ + __ret_708 = __noswap_vqdmlsl_s32(__rev0_708, __noswap_vget_high_s32(__rev1_708), __noswap_splat_lane_s32(__rev2_708, __p3_708)); \ + __ret_708 = __builtin_shufflevector(__ret_708, __ret_708, 1, 0); \ + __ret_708; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmlsl_high_lane_s16(__p0_617, __p1_617, __p2_617, __p3_617) __extension__ ({ \ - int32x4_t __s0_617 = __p0_617; \ - int16x8_t __s1_617 = __p1_617; \ - int16x4_t __s2_617 = __p2_617; \ - int32x4_t __ret_617; \ - __ret_617 = vqdmlsl_s16(__s0_617, vget_high_s16(__s1_617), splat_lane_s16(__s2_617, __p3_617)); \ - __ret_617; \ +#define vqdmlsl_high_lane_s16(__p0_709, __p1_709, __p2_709, __p3_709) __extension__ ({ \ + int32x4_t __s0_709 = __p0_709; \ + int16x8_t __s1_709 = __p1_709; \ + int16x4_t __s2_709 = __p2_709; \ + int32x4_t __ret_709; \ + __ret_709 = vqdmlsl_s16(__s0_709, vget_high_s16(__s1_709), splat_lane_s16(__s2_709, __p3_709)); \ + __ret_709; \ }) #else -#define vqdmlsl_high_lane_s16(__p0_618, __p1_618, __p2_618, __p3_618) __extension__ ({ \ - int32x4_t __s0_618 = __p0_618; \ - int16x8_t __s1_618 = __p1_618; \ - int16x4_t __s2_618 = __p2_618; \ 
- int32x4_t __rev0_618; __rev0_618 = __builtin_shufflevector(__s0_618, __s0_618, 3, 2, 1, 0); \ - int16x8_t __rev1_618; __rev1_618 = __builtin_shufflevector(__s1_618, __s1_618, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x4_t __rev2_618; __rev2_618 = __builtin_shufflevector(__s2_618, __s2_618, 3, 2, 1, 0); \ - int32x4_t __ret_618; \ - __ret_618 = __noswap_vqdmlsl_s16(__rev0_618, __noswap_vget_high_s16(__rev1_618), __noswap_splat_lane_s16(__rev2_618, __p3_618)); \ - __ret_618 = __builtin_shufflevector(__ret_618, __ret_618, 3, 2, 1, 0); \ - __ret_618; \ +#define vqdmlsl_high_lane_s16(__p0_710, __p1_710, __p2_710, __p3_710) __extension__ ({ \ + int32x4_t __s0_710 = __p0_710; \ + int16x8_t __s1_710 = __p1_710; \ + int16x4_t __s2_710 = __p2_710; \ + int32x4_t __rev0_710; __rev0_710 = __builtin_shufflevector(__s0_710, __s0_710, 3, 2, 1, 0); \ + int16x8_t __rev1_710; __rev1_710 = __builtin_shufflevector(__s1_710, __s1_710, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x4_t __rev2_710; __rev2_710 = __builtin_shufflevector(__s2_710, __s2_710, 3, 2, 1, 0); \ + int32x4_t __ret_710; \ + __ret_710 = __noswap_vqdmlsl_s16(__rev0_710, __noswap_vget_high_s16(__rev1_710), __noswap_splat_lane_s16(__rev2_710, __p3_710)); \ + __ret_710 = __builtin_shufflevector(__ret_710, __ret_710, 3, 2, 1, 0); \ + __ret_710; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmlsl_high_laneq_s32(__p0_619, __p1_619, __p2_619, __p3_619) __extension__ ({ \ - int64x2_t __s0_619 = __p0_619; \ - int32x4_t __s1_619 = __p1_619; \ - int32x4_t __s2_619 = __p2_619; \ - int64x2_t __ret_619; \ - __ret_619 = vqdmlsl_s32(__s0_619, vget_high_s32(__s1_619), splat_laneq_s32(__s2_619, __p3_619)); \ - __ret_619; \ +#define vqdmlsl_high_laneq_s32(__p0_711, __p1_711, __p2_711, __p3_711) __extension__ ({ \ + int64x2_t __s0_711 = __p0_711; \ + int32x4_t __s1_711 = __p1_711; \ + int32x4_t __s2_711 = __p2_711; \ + int64x2_t __ret_711; \ + __ret_711 = vqdmlsl_s32(__s0_711, vget_high_s32(__s1_711), splat_laneq_s32(__s2_711, __p3_711)); \ + __ret_711; \ }) #else -#define vqdmlsl_high_laneq_s32(__p0_620, __p1_620, __p2_620, __p3_620) __extension__ ({ \ - int64x2_t __s0_620 = __p0_620; \ - int32x4_t __s1_620 = __p1_620; \ - int32x4_t __s2_620 = __p2_620; \ - int64x2_t __rev0_620; __rev0_620 = __builtin_shufflevector(__s0_620, __s0_620, 1, 0); \ - int32x4_t __rev1_620; __rev1_620 = __builtin_shufflevector(__s1_620, __s1_620, 3, 2, 1, 0); \ - int32x4_t __rev2_620; __rev2_620 = __builtin_shufflevector(__s2_620, __s2_620, 3, 2, 1, 0); \ - int64x2_t __ret_620; \ - __ret_620 = __noswap_vqdmlsl_s32(__rev0_620, __noswap_vget_high_s32(__rev1_620), __noswap_splat_laneq_s32(__rev2_620, __p3_620)); \ - __ret_620 = __builtin_shufflevector(__ret_620, __ret_620, 1, 0); \ - __ret_620; \ +#define vqdmlsl_high_laneq_s32(__p0_712, __p1_712, __p2_712, __p3_712) __extension__ ({ \ + int64x2_t __s0_712 = __p0_712; \ + int32x4_t __s1_712 = __p1_712; \ + int32x4_t __s2_712 = __p2_712; \ + int64x2_t __rev0_712; __rev0_712 = __builtin_shufflevector(__s0_712, __s0_712, 1, 0); \ + int32x4_t __rev1_712; __rev1_712 = __builtin_shufflevector(__s1_712, __s1_712, 3, 2, 1, 0); \ + int32x4_t __rev2_712; __rev2_712 = __builtin_shufflevector(__s2_712, __s2_712, 3, 2, 1, 0); \ + int64x2_t __ret_712; \ + __ret_712 = __noswap_vqdmlsl_s32(__rev0_712, __noswap_vget_high_s32(__rev1_712), __noswap_splat_laneq_s32(__rev2_712, __p3_712)); \ + __ret_712 = __builtin_shufflevector(__ret_712, __ret_712, 1, 0); \ + __ret_712; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmlsl_high_laneq_s16(__p0_621, __p1_621, __p2_621, 
__p3_621) __extension__ ({ \ - int32x4_t __s0_621 = __p0_621; \ - int16x8_t __s1_621 = __p1_621; \ - int16x8_t __s2_621 = __p2_621; \ - int32x4_t __ret_621; \ - __ret_621 = vqdmlsl_s16(__s0_621, vget_high_s16(__s1_621), splat_laneq_s16(__s2_621, __p3_621)); \ - __ret_621; \ +#define vqdmlsl_high_laneq_s16(__p0_713, __p1_713, __p2_713, __p3_713) __extension__ ({ \ + int32x4_t __s0_713 = __p0_713; \ + int16x8_t __s1_713 = __p1_713; \ + int16x8_t __s2_713 = __p2_713; \ + int32x4_t __ret_713; \ + __ret_713 = vqdmlsl_s16(__s0_713, vget_high_s16(__s1_713), splat_laneq_s16(__s2_713, __p3_713)); \ + __ret_713; \ }) #else -#define vqdmlsl_high_laneq_s16(__p0_622, __p1_622, __p2_622, __p3_622) __extension__ ({ \ - int32x4_t __s0_622 = __p0_622; \ - int16x8_t __s1_622 = __p1_622; \ - int16x8_t __s2_622 = __p2_622; \ - int32x4_t __rev0_622; __rev0_622 = __builtin_shufflevector(__s0_622, __s0_622, 3, 2, 1, 0); \ - int16x8_t __rev1_622; __rev1_622 = __builtin_shufflevector(__s1_622, __s1_622, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev2_622; __rev2_622 = __builtin_shufflevector(__s2_622, __s2_622, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32x4_t __ret_622; \ - __ret_622 = __noswap_vqdmlsl_s16(__rev0_622, __noswap_vget_high_s16(__rev1_622), __noswap_splat_laneq_s16(__rev2_622, __p3_622)); \ - __ret_622 = __builtin_shufflevector(__ret_622, __ret_622, 3, 2, 1, 0); \ - __ret_622; \ +#define vqdmlsl_high_laneq_s16(__p0_714, __p1_714, __p2_714, __p3_714) __extension__ ({ \ + int32x4_t __s0_714 = __p0_714; \ + int16x8_t __s1_714 = __p1_714; \ + int16x8_t __s2_714 = __p2_714; \ + int32x4_t __rev0_714; __rev0_714 = __builtin_shufflevector(__s0_714, __s0_714, 3, 2, 1, 0); \ + int16x8_t __rev1_714; __rev1_714 = __builtin_shufflevector(__s1_714, __s1_714, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev2_714; __rev2_714 = __builtin_shufflevector(__s2_714, __s2_714, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x4_t __ret_714; \ + __ret_714 = __noswap_vqdmlsl_s16(__rev0_714, __noswap_vget_high_s16(__rev1_714), __noswap_splat_laneq_s16(__rev2_714, __p3_714)); \ + __ret_714 = __builtin_shufflevector(__ret_714, __ret_714, 3, 2, 1, 0); \ + __ret_714; \ }) #endif @@ -56477,50 +58147,50 @@ __ai int32x4_t vqdmlsl_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmlsl_laneq_s32(__p0_623, __p1_623, __p2_623, __p3_623) __extension__ ({ \ - int64x2_t __s0_623 = __p0_623; \ - int32x2_t __s1_623 = __p1_623; \ - int32x4_t __s2_623 = __p2_623; \ - int64x2_t __ret_623; \ - __ret_623 = vqdmlsl_s32(__s0_623, __s1_623, splat_laneq_s32(__s2_623, __p3_623)); \ - __ret_623; \ +#define vqdmlsl_laneq_s32(__p0_715, __p1_715, __p2_715, __p3_715) __extension__ ({ \ + int64x2_t __s0_715 = __p0_715; \ + int32x2_t __s1_715 = __p1_715; \ + int32x4_t __s2_715 = __p2_715; \ + int64x2_t __ret_715; \ + __ret_715 = vqdmlsl_s32(__s0_715, __s1_715, splat_laneq_s32(__s2_715, __p3_715)); \ + __ret_715; \ }) #else -#define vqdmlsl_laneq_s32(__p0_624, __p1_624, __p2_624, __p3_624) __extension__ ({ \ - int64x2_t __s0_624 = __p0_624; \ - int32x2_t __s1_624 = __p1_624; \ - int32x4_t __s2_624 = __p2_624; \ - int64x2_t __rev0_624; __rev0_624 = __builtin_shufflevector(__s0_624, __s0_624, 1, 0); \ - int32x2_t __rev1_624; __rev1_624 = __builtin_shufflevector(__s1_624, __s1_624, 1, 0); \ - int32x4_t __rev2_624; __rev2_624 = __builtin_shufflevector(__s2_624, __s2_624, 3, 2, 1, 0); \ - int64x2_t __ret_624; \ - __ret_624 = __noswap_vqdmlsl_s32(__rev0_624, __rev1_624, __noswap_splat_laneq_s32(__rev2_624, __p3_624)); \ - __ret_624 = 
__builtin_shufflevector(__ret_624, __ret_624, 1, 0); \ - __ret_624; \ +#define vqdmlsl_laneq_s32(__p0_716, __p1_716, __p2_716, __p3_716) __extension__ ({ \ + int64x2_t __s0_716 = __p0_716; \ + int32x2_t __s1_716 = __p1_716; \ + int32x4_t __s2_716 = __p2_716; \ + int64x2_t __rev0_716; __rev0_716 = __builtin_shufflevector(__s0_716, __s0_716, 1, 0); \ + int32x2_t __rev1_716; __rev1_716 = __builtin_shufflevector(__s1_716, __s1_716, 1, 0); \ + int32x4_t __rev2_716; __rev2_716 = __builtin_shufflevector(__s2_716, __s2_716, 3, 2, 1, 0); \ + int64x2_t __ret_716; \ + __ret_716 = __noswap_vqdmlsl_s32(__rev0_716, __rev1_716, __noswap_splat_laneq_s32(__rev2_716, __p3_716)); \ + __ret_716 = __builtin_shufflevector(__ret_716, __ret_716, 1, 0); \ + __ret_716; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmlsl_laneq_s16(__p0_625, __p1_625, __p2_625, __p3_625) __extension__ ({ \ - int32x4_t __s0_625 = __p0_625; \ - int16x4_t __s1_625 = __p1_625; \ - int16x8_t __s2_625 = __p2_625; \ - int32x4_t __ret_625; \ - __ret_625 = vqdmlsl_s16(__s0_625, __s1_625, splat_laneq_s16(__s2_625, __p3_625)); \ - __ret_625; \ +#define vqdmlsl_laneq_s16(__p0_717, __p1_717, __p2_717, __p3_717) __extension__ ({ \ + int32x4_t __s0_717 = __p0_717; \ + int16x4_t __s1_717 = __p1_717; \ + int16x8_t __s2_717 = __p2_717; \ + int32x4_t __ret_717; \ + __ret_717 = vqdmlsl_s16(__s0_717, __s1_717, splat_laneq_s16(__s2_717, __p3_717)); \ + __ret_717; \ }) #else -#define vqdmlsl_laneq_s16(__p0_626, __p1_626, __p2_626, __p3_626) __extension__ ({ \ - int32x4_t __s0_626 = __p0_626; \ - int16x4_t __s1_626 = __p1_626; \ - int16x8_t __s2_626 = __p2_626; \ - int32x4_t __rev0_626; __rev0_626 = __builtin_shufflevector(__s0_626, __s0_626, 3, 2, 1, 0); \ - int16x4_t __rev1_626; __rev1_626 = __builtin_shufflevector(__s1_626, __s1_626, 3, 2, 1, 0); \ - int16x8_t __rev2_626; __rev2_626 = __builtin_shufflevector(__s2_626, __s2_626, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32x4_t __ret_626; \ - __ret_626 = __noswap_vqdmlsl_s16(__rev0_626, __rev1_626, __noswap_splat_laneq_s16(__rev2_626, __p3_626)); \ - __ret_626 = __builtin_shufflevector(__ret_626, __ret_626, 3, 2, 1, 0); \ - __ret_626; \ +#define vqdmlsl_laneq_s16(__p0_718, __p1_718, __p2_718, __p3_718) __extension__ ({ \ + int32x4_t __s0_718 = __p0_718; \ + int16x4_t __s1_718 = __p1_718; \ + int16x8_t __s2_718 = __p2_718; \ + int32x4_t __rev0_718; __rev0_718 = __builtin_shufflevector(__s0_718, __s0_718, 3, 2, 1, 0); \ + int16x4_t __rev1_718; __rev1_718 = __builtin_shufflevector(__s1_718, __s1_718, 3, 2, 1, 0); \ + int16x8_t __rev2_718; __rev2_718 = __builtin_shufflevector(__s2_718, __s2_718, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x4_t __ret_718; \ + __ret_718 = __noswap_vqdmlsl_s16(__rev0_718, __rev1_718, __noswap_splat_laneq_s16(__rev2_718, __p3_718)); \ + __ret_718 = __builtin_shufflevector(__ret_718, __ret_718, 3, 2, 1, 0); \ + __ret_718; \ }) #endif @@ -56619,78 +58289,78 @@ __ai int16_t vqdmulhh_s16(int16_t __p0, int16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmulhs_lane_s32(__p0_627, __p1_627, __p2_627) __extension__ ({ \ - int32_t __s0_627 = __p0_627; \ - int32x2_t __s1_627 = __p1_627; \ - int32_t __ret_627; \ - __ret_627 = vqdmulhs_s32(__s0_627, vget_lane_s32(__s1_627, __p2_627)); \ - __ret_627; \ +#define vqdmulhs_lane_s32(__p0_719, __p1_719, __p2_719) __extension__ ({ \ + int32_t __s0_719 = __p0_719; \ + int32x2_t __s1_719 = __p1_719; \ + int32_t __ret_719; \ + __ret_719 = vqdmulhs_s32(__s0_719, vget_lane_s32(__s1_719, __p2_719)); \ + __ret_719; \ }) #else -#define vqdmulhs_lane_s32(__p0_628, 
__p1_628, __p2_628) __extension__ ({ \ - int32_t __s0_628 = __p0_628; \ - int32x2_t __s1_628 = __p1_628; \ - int32x2_t __rev1_628; __rev1_628 = __builtin_shufflevector(__s1_628, __s1_628, 1, 0); \ - int32_t __ret_628; \ - __ret_628 = vqdmulhs_s32(__s0_628, __noswap_vget_lane_s32(__rev1_628, __p2_628)); \ - __ret_628; \ +#define vqdmulhs_lane_s32(__p0_720, __p1_720, __p2_720) __extension__ ({ \ + int32_t __s0_720 = __p0_720; \ + int32x2_t __s1_720 = __p1_720; \ + int32x2_t __rev1_720; __rev1_720 = __builtin_shufflevector(__s1_720, __s1_720, 1, 0); \ + int32_t __ret_720; \ + __ret_720 = vqdmulhs_s32(__s0_720, __noswap_vget_lane_s32(__rev1_720, __p2_720)); \ + __ret_720; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmulhh_lane_s16(__p0_629, __p1_629, __p2_629) __extension__ ({ \ - int16_t __s0_629 = __p0_629; \ - int16x4_t __s1_629 = __p1_629; \ - int16_t __ret_629; \ - __ret_629 = vqdmulhh_s16(__s0_629, vget_lane_s16(__s1_629, __p2_629)); \ - __ret_629; \ +#define vqdmulhh_lane_s16(__p0_721, __p1_721, __p2_721) __extension__ ({ \ + int16_t __s0_721 = __p0_721; \ + int16x4_t __s1_721 = __p1_721; \ + int16_t __ret_721; \ + __ret_721 = vqdmulhh_s16(__s0_721, vget_lane_s16(__s1_721, __p2_721)); \ + __ret_721; \ }) #else -#define vqdmulhh_lane_s16(__p0_630, __p1_630, __p2_630) __extension__ ({ \ - int16_t __s0_630 = __p0_630; \ - int16x4_t __s1_630 = __p1_630; \ - int16x4_t __rev1_630; __rev1_630 = __builtin_shufflevector(__s1_630, __s1_630, 3, 2, 1, 0); \ - int16_t __ret_630; \ - __ret_630 = vqdmulhh_s16(__s0_630, __noswap_vget_lane_s16(__rev1_630, __p2_630)); \ - __ret_630; \ +#define vqdmulhh_lane_s16(__p0_722, __p1_722, __p2_722) __extension__ ({ \ + int16_t __s0_722 = __p0_722; \ + int16x4_t __s1_722 = __p1_722; \ + int16x4_t __rev1_722; __rev1_722 = __builtin_shufflevector(__s1_722, __s1_722, 3, 2, 1, 0); \ + int16_t __ret_722; \ + __ret_722 = vqdmulhh_s16(__s0_722, __noswap_vget_lane_s16(__rev1_722, __p2_722)); \ + __ret_722; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmulhs_laneq_s32(__p0_631, __p1_631, __p2_631) __extension__ ({ \ - int32_t __s0_631 = __p0_631; \ - int32x4_t __s1_631 = __p1_631; \ - int32_t __ret_631; \ - __ret_631 = vqdmulhs_s32(__s0_631, vgetq_lane_s32(__s1_631, __p2_631)); \ - __ret_631; \ +#define vqdmulhs_laneq_s32(__p0_723, __p1_723, __p2_723) __extension__ ({ \ + int32_t __s0_723 = __p0_723; \ + int32x4_t __s1_723 = __p1_723; \ + int32_t __ret_723; \ + __ret_723 = vqdmulhs_s32(__s0_723, vgetq_lane_s32(__s1_723, __p2_723)); \ + __ret_723; \ }) #else -#define vqdmulhs_laneq_s32(__p0_632, __p1_632, __p2_632) __extension__ ({ \ - int32_t __s0_632 = __p0_632; \ - int32x4_t __s1_632 = __p1_632; \ - int32x4_t __rev1_632; __rev1_632 = __builtin_shufflevector(__s1_632, __s1_632, 3, 2, 1, 0); \ - int32_t __ret_632; \ - __ret_632 = vqdmulhs_s32(__s0_632, __noswap_vgetq_lane_s32(__rev1_632, __p2_632)); \ - __ret_632; \ +#define vqdmulhs_laneq_s32(__p0_724, __p1_724, __p2_724) __extension__ ({ \ + int32_t __s0_724 = __p0_724; \ + int32x4_t __s1_724 = __p1_724; \ + int32x4_t __rev1_724; __rev1_724 = __builtin_shufflevector(__s1_724, __s1_724, 3, 2, 1, 0); \ + int32_t __ret_724; \ + __ret_724 = vqdmulhs_s32(__s0_724, __noswap_vgetq_lane_s32(__rev1_724, __p2_724)); \ + __ret_724; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmulhh_laneq_s16(__p0_633, __p1_633, __p2_633) __extension__ ({ \ - int16_t __s0_633 = __p0_633; \ - int16x8_t __s1_633 = __p1_633; \ - int16_t __ret_633; \ - __ret_633 = vqdmulhh_s16(__s0_633, vgetq_lane_s16(__s1_633, __p2_633)); \ - __ret_633; \ 
+#define vqdmulhh_laneq_s16(__p0_725, __p1_725, __p2_725) __extension__ ({ \ + int16_t __s0_725 = __p0_725; \ + int16x8_t __s1_725 = __p1_725; \ + int16_t __ret_725; \ + __ret_725 = vqdmulhh_s16(__s0_725, vgetq_lane_s16(__s1_725, __p2_725)); \ + __ret_725; \ }) #else -#define vqdmulhh_laneq_s16(__p0_634, __p1_634, __p2_634) __extension__ ({ \ - int16_t __s0_634 = __p0_634; \ - int16x8_t __s1_634 = __p1_634; \ - int16x8_t __rev1_634; __rev1_634 = __builtin_shufflevector(__s1_634, __s1_634, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16_t __ret_634; \ - __ret_634 = vqdmulhh_s16(__s0_634, __noswap_vgetq_lane_s16(__rev1_634, __p2_634)); \ - __ret_634; \ +#define vqdmulhh_laneq_s16(__p0_726, __p1_726, __p2_726) __extension__ ({ \ + int16_t __s0_726 = __p0_726; \ + int16x8_t __s1_726 = __p1_726; \ + int16x8_t __rev1_726; __rev1_726 = __builtin_shufflevector(__s1_726, __s1_726, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16_t __ret_726; \ + __ret_726 = vqdmulhh_s16(__s0_726, __noswap_vgetq_lane_s16(__rev1_726, __p2_726)); \ + __ret_726; \ }) #endif @@ -56823,86 +58493,86 @@ __ai int32x4_t vqdmull_high_s16(int16x8_t __p0, int16x8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmull_high_lane_s32(__p0_635, __p1_635, __p2_635) __extension__ ({ \ - int32x4_t __s0_635 = __p0_635; \ - int32x2_t __s1_635 = __p1_635; \ - int64x2_t __ret_635; \ - __ret_635 = vqdmull_s32(vget_high_s32(__s0_635), splat_lane_s32(__s1_635, __p2_635)); \ - __ret_635; \ +#define vqdmull_high_lane_s32(__p0_727, __p1_727, __p2_727) __extension__ ({ \ + int32x4_t __s0_727 = __p0_727; \ + int32x2_t __s1_727 = __p1_727; \ + int64x2_t __ret_727; \ + __ret_727 = vqdmull_s32(vget_high_s32(__s0_727), splat_lane_s32(__s1_727, __p2_727)); \ + __ret_727; \ }) #else -#define vqdmull_high_lane_s32(__p0_636, __p1_636, __p2_636) __extension__ ({ \ - int32x4_t __s0_636 = __p0_636; \ - int32x2_t __s1_636 = __p1_636; \ - int32x4_t __rev0_636; __rev0_636 = __builtin_shufflevector(__s0_636, __s0_636, 3, 2, 1, 0); \ - int32x2_t __rev1_636; __rev1_636 = __builtin_shufflevector(__s1_636, __s1_636, 1, 0); \ - int64x2_t __ret_636; \ - __ret_636 = __noswap_vqdmull_s32(__noswap_vget_high_s32(__rev0_636), __noswap_splat_lane_s32(__rev1_636, __p2_636)); \ - __ret_636 = __builtin_shufflevector(__ret_636, __ret_636, 1, 0); \ - __ret_636; \ +#define vqdmull_high_lane_s32(__p0_728, __p1_728, __p2_728) __extension__ ({ \ + int32x4_t __s0_728 = __p0_728; \ + int32x2_t __s1_728 = __p1_728; \ + int32x4_t __rev0_728; __rev0_728 = __builtin_shufflevector(__s0_728, __s0_728, 3, 2, 1, 0); \ + int32x2_t __rev1_728; __rev1_728 = __builtin_shufflevector(__s1_728, __s1_728, 1, 0); \ + int64x2_t __ret_728; \ + __ret_728 = __noswap_vqdmull_s32(__noswap_vget_high_s32(__rev0_728), __noswap_splat_lane_s32(__rev1_728, __p2_728)); \ + __ret_728 = __builtin_shufflevector(__ret_728, __ret_728, 1, 0); \ + __ret_728; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmull_high_lane_s16(__p0_637, __p1_637, __p2_637) __extension__ ({ \ - int16x8_t __s0_637 = __p0_637; \ - int16x4_t __s1_637 = __p1_637; \ - int32x4_t __ret_637; \ - __ret_637 = vqdmull_s16(vget_high_s16(__s0_637), splat_lane_s16(__s1_637, __p2_637)); \ - __ret_637; \ +#define vqdmull_high_lane_s16(__p0_729, __p1_729, __p2_729) __extension__ ({ \ + int16x8_t __s0_729 = __p0_729; \ + int16x4_t __s1_729 = __p1_729; \ + int32x4_t __ret_729; \ + __ret_729 = vqdmull_s16(vget_high_s16(__s0_729), splat_lane_s16(__s1_729, __p2_729)); \ + __ret_729; \ }) #else -#define vqdmull_high_lane_s16(__p0_638, __p1_638, __p2_638) __extension__ ({ \ - int16x8_t 
__s0_638 = __p0_638; \ - int16x4_t __s1_638 = __p1_638; \ - int16x8_t __rev0_638; __rev0_638 = __builtin_shufflevector(__s0_638, __s0_638, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x4_t __rev1_638; __rev1_638 = __builtin_shufflevector(__s1_638, __s1_638, 3, 2, 1, 0); \ - int32x4_t __ret_638; \ - __ret_638 = __noswap_vqdmull_s16(__noswap_vget_high_s16(__rev0_638), __noswap_splat_lane_s16(__rev1_638, __p2_638)); \ - __ret_638 = __builtin_shufflevector(__ret_638, __ret_638, 3, 2, 1, 0); \ - __ret_638; \ +#define vqdmull_high_lane_s16(__p0_730, __p1_730, __p2_730) __extension__ ({ \ + int16x8_t __s0_730 = __p0_730; \ + int16x4_t __s1_730 = __p1_730; \ + int16x8_t __rev0_730; __rev0_730 = __builtin_shufflevector(__s0_730, __s0_730, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x4_t __rev1_730; __rev1_730 = __builtin_shufflevector(__s1_730, __s1_730, 3, 2, 1, 0); \ + int32x4_t __ret_730; \ + __ret_730 = __noswap_vqdmull_s16(__noswap_vget_high_s16(__rev0_730), __noswap_splat_lane_s16(__rev1_730, __p2_730)); \ + __ret_730 = __builtin_shufflevector(__ret_730, __ret_730, 3, 2, 1, 0); \ + __ret_730; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmull_high_laneq_s32(__p0_639, __p1_639, __p2_639) __extension__ ({ \ - int32x4_t __s0_639 = __p0_639; \ - int32x4_t __s1_639 = __p1_639; \ - int64x2_t __ret_639; \ - __ret_639 = vqdmull_s32(vget_high_s32(__s0_639), splat_laneq_s32(__s1_639, __p2_639)); \ - __ret_639; \ +#define vqdmull_high_laneq_s32(__p0_731, __p1_731, __p2_731) __extension__ ({ \ + int32x4_t __s0_731 = __p0_731; \ + int32x4_t __s1_731 = __p1_731; \ + int64x2_t __ret_731; \ + __ret_731 = vqdmull_s32(vget_high_s32(__s0_731), splat_laneq_s32(__s1_731, __p2_731)); \ + __ret_731; \ }) #else -#define vqdmull_high_laneq_s32(__p0_640, __p1_640, __p2_640) __extension__ ({ \ - int32x4_t __s0_640 = __p0_640; \ - int32x4_t __s1_640 = __p1_640; \ - int32x4_t __rev0_640; __rev0_640 = __builtin_shufflevector(__s0_640, __s0_640, 3, 2, 1, 0); \ - int32x4_t __rev1_640; __rev1_640 = __builtin_shufflevector(__s1_640, __s1_640, 3, 2, 1, 0); \ - int64x2_t __ret_640; \ - __ret_640 = __noswap_vqdmull_s32(__noswap_vget_high_s32(__rev0_640), __noswap_splat_laneq_s32(__rev1_640, __p2_640)); \ - __ret_640 = __builtin_shufflevector(__ret_640, __ret_640, 1, 0); \ - __ret_640; \ +#define vqdmull_high_laneq_s32(__p0_732, __p1_732, __p2_732) __extension__ ({ \ + int32x4_t __s0_732 = __p0_732; \ + int32x4_t __s1_732 = __p1_732; \ + int32x4_t __rev0_732; __rev0_732 = __builtin_shufflevector(__s0_732, __s0_732, 3, 2, 1, 0); \ + int32x4_t __rev1_732; __rev1_732 = __builtin_shufflevector(__s1_732, __s1_732, 3, 2, 1, 0); \ + int64x2_t __ret_732; \ + __ret_732 = __noswap_vqdmull_s32(__noswap_vget_high_s32(__rev0_732), __noswap_splat_laneq_s32(__rev1_732, __p2_732)); \ + __ret_732 = __builtin_shufflevector(__ret_732, __ret_732, 1, 0); \ + __ret_732; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmull_high_laneq_s16(__p0_641, __p1_641, __p2_641) __extension__ ({ \ - int16x8_t __s0_641 = __p0_641; \ - int16x8_t __s1_641 = __p1_641; \ - int32x4_t __ret_641; \ - __ret_641 = vqdmull_s16(vget_high_s16(__s0_641), splat_laneq_s16(__s1_641, __p2_641)); \ - __ret_641; \ +#define vqdmull_high_laneq_s16(__p0_733, __p1_733, __p2_733) __extension__ ({ \ + int16x8_t __s0_733 = __p0_733; \ + int16x8_t __s1_733 = __p1_733; \ + int32x4_t __ret_733; \ + __ret_733 = vqdmull_s16(vget_high_s16(__s0_733), splat_laneq_s16(__s1_733, __p2_733)); \ + __ret_733; \ }) #else -#define vqdmull_high_laneq_s16(__p0_642, __p1_642, __p2_642) __extension__ ({ \ - int16x8_t __s0_642 
= __p0_642; \ - int16x8_t __s1_642 = __p1_642; \ - int16x8_t __rev0_642; __rev0_642 = __builtin_shufflevector(__s0_642, __s0_642, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev1_642; __rev1_642 = __builtin_shufflevector(__s1_642, __s1_642, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32x4_t __ret_642; \ - __ret_642 = __noswap_vqdmull_s16(__noswap_vget_high_s16(__rev0_642), __noswap_splat_laneq_s16(__rev1_642, __p2_642)); \ - __ret_642 = __builtin_shufflevector(__ret_642, __ret_642, 3, 2, 1, 0); \ - __ret_642; \ +#define vqdmull_high_laneq_s16(__p0_734, __p1_734, __p2_734) __extension__ ({ \ + int16x8_t __s0_734 = __p0_734; \ + int16x8_t __s1_734 = __p1_734; \ + int16x8_t __rev0_734; __rev0_734 = __builtin_shufflevector(__s0_734, __s0_734, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev1_734; __rev1_734 = __builtin_shufflevector(__s1_734, __s1_734, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x4_t __ret_734; \ + __ret_734 = __noswap_vqdmull_s16(__noswap_vget_high_s16(__rev0_734), __noswap_splat_laneq_s16(__rev1_734, __p2_734)); \ + __ret_734 = __builtin_shufflevector(__ret_734, __ret_734, 3, 2, 1, 0); \ + __ret_734; \ }) #endif @@ -56939,120 +58609,120 @@ __ai int32x4_t vqdmull_high_n_s16(int16x8_t __p0, int16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmulls_lane_s32(__p0_643, __p1_643, __p2_643) __extension__ ({ \ - int32_t __s0_643 = __p0_643; \ - int32x2_t __s1_643 = __p1_643; \ - int64_t __ret_643; \ - __ret_643 = vqdmulls_s32(__s0_643, vget_lane_s32(__s1_643, __p2_643)); \ - __ret_643; \ +#define vqdmulls_lane_s32(__p0_735, __p1_735, __p2_735) __extension__ ({ \ + int32_t __s0_735 = __p0_735; \ + int32x2_t __s1_735 = __p1_735; \ + int64_t __ret_735; \ + __ret_735 = vqdmulls_s32(__s0_735, vget_lane_s32(__s1_735, __p2_735)); \ + __ret_735; \ }) #else -#define vqdmulls_lane_s32(__p0_644, __p1_644, __p2_644) __extension__ ({ \ - int32_t __s0_644 = __p0_644; \ - int32x2_t __s1_644 = __p1_644; \ - int32x2_t __rev1_644; __rev1_644 = __builtin_shufflevector(__s1_644, __s1_644, 1, 0); \ - int64_t __ret_644; \ - __ret_644 = vqdmulls_s32(__s0_644, __noswap_vget_lane_s32(__rev1_644, __p2_644)); \ - __ret_644; \ +#define vqdmulls_lane_s32(__p0_736, __p1_736, __p2_736) __extension__ ({ \ + int32_t __s0_736 = __p0_736; \ + int32x2_t __s1_736 = __p1_736; \ + int32x2_t __rev1_736; __rev1_736 = __builtin_shufflevector(__s1_736, __s1_736, 1, 0); \ + int64_t __ret_736; \ + __ret_736 = vqdmulls_s32(__s0_736, __noswap_vget_lane_s32(__rev1_736, __p2_736)); \ + __ret_736; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmullh_lane_s16(__p0_645, __p1_645, __p2_645) __extension__ ({ \ - int16_t __s0_645 = __p0_645; \ - int16x4_t __s1_645 = __p1_645; \ - int32_t __ret_645; \ - __ret_645 = vqdmullh_s16(__s0_645, vget_lane_s16(__s1_645, __p2_645)); \ - __ret_645; \ +#define vqdmullh_lane_s16(__p0_737, __p1_737, __p2_737) __extension__ ({ \ + int16_t __s0_737 = __p0_737; \ + int16x4_t __s1_737 = __p1_737; \ + int32_t __ret_737; \ + __ret_737 = vqdmullh_s16(__s0_737, vget_lane_s16(__s1_737, __p2_737)); \ + __ret_737; \ }) #else -#define vqdmullh_lane_s16(__p0_646, __p1_646, __p2_646) __extension__ ({ \ - int16_t __s0_646 = __p0_646; \ - int16x4_t __s1_646 = __p1_646; \ - int16x4_t __rev1_646; __rev1_646 = __builtin_shufflevector(__s1_646, __s1_646, 3, 2, 1, 0); \ - int32_t __ret_646; \ - __ret_646 = vqdmullh_s16(__s0_646, __noswap_vget_lane_s16(__rev1_646, __p2_646)); \ - __ret_646; \ +#define vqdmullh_lane_s16(__p0_738, __p1_738, __p2_738) __extension__ ({ \ + int16_t __s0_738 = __p0_738; \ + int16x4_t __s1_738 = __p1_738; \ + 
int16x4_t __rev1_738; __rev1_738 = __builtin_shufflevector(__s1_738, __s1_738, 3, 2, 1, 0); \ + int32_t __ret_738; \ + __ret_738 = vqdmullh_s16(__s0_738, __noswap_vget_lane_s16(__rev1_738, __p2_738)); \ + __ret_738; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmulls_laneq_s32(__p0_647, __p1_647, __p2_647) __extension__ ({ \ - int32_t __s0_647 = __p0_647; \ - int32x4_t __s1_647 = __p1_647; \ - int64_t __ret_647; \ - __ret_647 = vqdmulls_s32(__s0_647, vgetq_lane_s32(__s1_647, __p2_647)); \ - __ret_647; \ +#define vqdmulls_laneq_s32(__p0_739, __p1_739, __p2_739) __extension__ ({ \ + int32_t __s0_739 = __p0_739; \ + int32x4_t __s1_739 = __p1_739; \ + int64_t __ret_739; \ + __ret_739 = vqdmulls_s32(__s0_739, vgetq_lane_s32(__s1_739, __p2_739)); \ + __ret_739; \ }) #else -#define vqdmulls_laneq_s32(__p0_648, __p1_648, __p2_648) __extension__ ({ \ - int32_t __s0_648 = __p0_648; \ - int32x4_t __s1_648 = __p1_648; \ - int32x4_t __rev1_648; __rev1_648 = __builtin_shufflevector(__s1_648, __s1_648, 3, 2, 1, 0); \ - int64_t __ret_648; \ - __ret_648 = vqdmulls_s32(__s0_648, __noswap_vgetq_lane_s32(__rev1_648, __p2_648)); \ - __ret_648; \ +#define vqdmulls_laneq_s32(__p0_740, __p1_740, __p2_740) __extension__ ({ \ + int32_t __s0_740 = __p0_740; \ + int32x4_t __s1_740 = __p1_740; \ + int32x4_t __rev1_740; __rev1_740 = __builtin_shufflevector(__s1_740, __s1_740, 3, 2, 1, 0); \ + int64_t __ret_740; \ + __ret_740 = vqdmulls_s32(__s0_740, __noswap_vgetq_lane_s32(__rev1_740, __p2_740)); \ + __ret_740; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmullh_laneq_s16(__p0_649, __p1_649, __p2_649) __extension__ ({ \ - int16_t __s0_649 = __p0_649; \ - int16x8_t __s1_649 = __p1_649; \ - int32_t __ret_649; \ - __ret_649 = vqdmullh_s16(__s0_649, vgetq_lane_s16(__s1_649, __p2_649)); \ - __ret_649; \ +#define vqdmullh_laneq_s16(__p0_741, __p1_741, __p2_741) __extension__ ({ \ + int16_t __s0_741 = __p0_741; \ + int16x8_t __s1_741 = __p1_741; \ + int32_t __ret_741; \ + __ret_741 = vqdmullh_s16(__s0_741, vgetq_lane_s16(__s1_741, __p2_741)); \ + __ret_741; \ }) #else -#define vqdmullh_laneq_s16(__p0_650, __p1_650, __p2_650) __extension__ ({ \ - int16_t __s0_650 = __p0_650; \ - int16x8_t __s1_650 = __p1_650; \ - int16x8_t __rev1_650; __rev1_650 = __builtin_shufflevector(__s1_650, __s1_650, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32_t __ret_650; \ - __ret_650 = vqdmullh_s16(__s0_650, __noswap_vgetq_lane_s16(__rev1_650, __p2_650)); \ - __ret_650; \ +#define vqdmullh_laneq_s16(__p0_742, __p1_742, __p2_742) __extension__ ({ \ + int16_t __s0_742 = __p0_742; \ + int16x8_t __s1_742 = __p1_742; \ + int16x8_t __rev1_742; __rev1_742 = __builtin_shufflevector(__s1_742, __s1_742, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32_t __ret_742; \ + __ret_742 = vqdmullh_s16(__s0_742, __noswap_vgetq_lane_s16(__rev1_742, __p2_742)); \ + __ret_742; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmull_laneq_s32(__p0_651, __p1_651, __p2_651) __extension__ ({ \ - int32x2_t __s0_651 = __p0_651; \ - int32x4_t __s1_651 = __p1_651; \ - int64x2_t __ret_651; \ - __ret_651 = vqdmull_s32(__s0_651, splat_laneq_s32(__s1_651, __p2_651)); \ - __ret_651; \ +#define vqdmull_laneq_s32(__p0_743, __p1_743, __p2_743) __extension__ ({ \ + int32x2_t __s0_743 = __p0_743; \ + int32x4_t __s1_743 = __p1_743; \ + int64x2_t __ret_743; \ + __ret_743 = vqdmull_s32(__s0_743, splat_laneq_s32(__s1_743, __p2_743)); \ + __ret_743; \ }) #else -#define vqdmull_laneq_s32(__p0_652, __p1_652, __p2_652) __extension__ ({ \ - int32x2_t __s0_652 = __p0_652; \ - int32x4_t __s1_652 = __p1_652; \ - 
int32x2_t __rev0_652; __rev0_652 = __builtin_shufflevector(__s0_652, __s0_652, 1, 0); \ - int32x4_t __rev1_652; __rev1_652 = __builtin_shufflevector(__s1_652, __s1_652, 3, 2, 1, 0); \ - int64x2_t __ret_652; \ - __ret_652 = __noswap_vqdmull_s32(__rev0_652, __noswap_splat_laneq_s32(__rev1_652, __p2_652)); \ - __ret_652 = __builtin_shufflevector(__ret_652, __ret_652, 1, 0); \ - __ret_652; \ +#define vqdmull_laneq_s32(__p0_744, __p1_744, __p2_744) __extension__ ({ \ + int32x2_t __s0_744 = __p0_744; \ + int32x4_t __s1_744 = __p1_744; \ + int32x2_t __rev0_744; __rev0_744 = __builtin_shufflevector(__s0_744, __s0_744, 1, 0); \ + int32x4_t __rev1_744; __rev1_744 = __builtin_shufflevector(__s1_744, __s1_744, 3, 2, 1, 0); \ + int64x2_t __ret_744; \ + __ret_744 = __noswap_vqdmull_s32(__rev0_744, __noswap_splat_laneq_s32(__rev1_744, __p2_744)); \ + __ret_744 = __builtin_shufflevector(__ret_744, __ret_744, 1, 0); \ + __ret_744; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmull_laneq_s16(__p0_653, __p1_653, __p2_653) __extension__ ({ \ - int16x4_t __s0_653 = __p0_653; \ - int16x8_t __s1_653 = __p1_653; \ - int32x4_t __ret_653; \ - __ret_653 = vqdmull_s16(__s0_653, splat_laneq_s16(__s1_653, __p2_653)); \ - __ret_653; \ +#define vqdmull_laneq_s16(__p0_745, __p1_745, __p2_745) __extension__ ({ \ + int16x4_t __s0_745 = __p0_745; \ + int16x8_t __s1_745 = __p1_745; \ + int32x4_t __ret_745; \ + __ret_745 = vqdmull_s16(__s0_745, splat_laneq_s16(__s1_745, __p2_745)); \ + __ret_745; \ }) #else -#define vqdmull_laneq_s16(__p0_654, __p1_654, __p2_654) __extension__ ({ \ - int16x4_t __s0_654 = __p0_654; \ - int16x8_t __s1_654 = __p1_654; \ - int16x4_t __rev0_654; __rev0_654 = __builtin_shufflevector(__s0_654, __s0_654, 3, 2, 1, 0); \ - int16x8_t __rev1_654; __rev1_654 = __builtin_shufflevector(__s1_654, __s1_654, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32x4_t __ret_654; \ - __ret_654 = __noswap_vqdmull_s16(__rev0_654, __noswap_splat_laneq_s16(__rev1_654, __p2_654)); \ - __ret_654 = __builtin_shufflevector(__ret_654, __ret_654, 3, 2, 1, 0); \ - __ret_654; \ +#define vqdmull_laneq_s16(__p0_746, __p1_746, __p2_746) __extension__ ({ \ + int16x4_t __s0_746 = __p0_746; \ + int16x8_t __s1_746 = __p1_746; \ + int16x4_t __rev0_746; __rev0_746 = __builtin_shufflevector(__s0_746, __s0_746, 3, 2, 1, 0); \ + int16x8_t __rev1_746; __rev1_746 = __builtin_shufflevector(__s1_746, __s1_746, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x4_t __ret_746; \ + __ret_746 = __noswap_vqdmull_s16(__rev0_746, __noswap_splat_laneq_s16(__rev1_746, __p2_746)); \ + __ret_746 = __builtin_shufflevector(__ret_746, __ret_746, 3, 2, 1, 0); \ + __ret_746; \ }) #endif @@ -57188,30 +58858,30 @@ __ai int8x16_t vqmovn_high_s16(int8x8_t __p0, int16x8_t __p1) { } #endif -__ai int16_t vqmovuns_s32(int32_t __p0) { - int16_t __ret; - __ret = (int16_t) __builtin_neon_vqmovuns_s32(__p0); +__ai uint16_t vqmovuns_s32(int32_t __p0) { + uint16_t __ret; + __ret = (uint16_t) __builtin_neon_vqmovuns_s32(__p0); return __ret; } -__ai int32_t vqmovund_s64(int64_t __p0) { - int32_t __ret; - __ret = (int32_t) __builtin_neon_vqmovund_s64(__p0); +__ai uint32_t vqmovund_s64(int64_t __p0) { + uint32_t __ret; + __ret = (uint32_t) __builtin_neon_vqmovund_s64(__p0); return __ret; } -__ai int8_t vqmovunh_s16(int16_t __p0) { - int8_t __ret; - __ret = (int8_t) __builtin_neon_vqmovunh_s16(__p0); +__ai uint8_t vqmovunh_s16(int16_t __p0) { + uint8_t __ret; + __ret = (uint8_t) __builtin_neon_vqmovunh_s16(__p0); return __ret; } #ifdef __LITTLE_ENDIAN__ -__ai uint16x8_t vqmovun_high_s32(int16x4_t __p0, 
int32x4_t __p1) { +__ai uint16x8_t vqmovun_high_s32(uint16x4_t __p0, int32x4_t __p1) { uint16x8_t __ret; __ret = vcombine_u16((uint16x4_t)(__p0), vqmovun_s32(__p1)); return __ret; } #else -__ai uint16x8_t vqmovun_high_s32(int16x4_t __p0, int32x4_t __p1) { - int16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); +__ai uint16x8_t vqmovun_high_s32(uint16x4_t __p0, int32x4_t __p1) { + uint16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); int32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); uint16x8_t __ret; __ret = __noswap_vcombine_u16((uint16x4_t)(__rev0), __noswap_vqmovun_s32(__rev1)); @@ -57221,14 +58891,14 @@ __ai uint16x8_t vqmovun_high_s32(int16x4_t __p0, int32x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint32x4_t vqmovun_high_s64(int32x2_t __p0, int64x2_t __p1) { +__ai uint32x4_t vqmovun_high_s64(uint32x2_t __p0, int64x2_t __p1) { uint32x4_t __ret; __ret = vcombine_u32((uint32x2_t)(__p0), vqmovun_s64(__p1)); return __ret; } #else -__ai uint32x4_t vqmovun_high_s64(int32x2_t __p0, int64x2_t __p1) { - int32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); +__ai uint32x4_t vqmovun_high_s64(uint32x2_t __p0, int64x2_t __p1) { + uint32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); int64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); uint32x4_t __ret; __ret = __noswap_vcombine_u32((uint32x2_t)(__rev0), __noswap_vqmovun_s64(__rev1)); @@ -57238,14 +58908,14 @@ __ai uint32x4_t vqmovun_high_s64(int32x2_t __p0, int64x2_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint8x16_t vqmovun_high_s16(int8x8_t __p0, int16x8_t __p1) { +__ai uint8x16_t vqmovun_high_s16(uint8x8_t __p0, int16x8_t __p1) { uint8x16_t __ret; __ret = vcombine_u8((uint8x8_t)(__p0), vqmovun_s16(__p1)); return __ret; } #else -__ai uint8x16_t vqmovun_high_s16(int8x8_t __p0, int16x8_t __p1) { - int8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); +__ai uint8x16_t vqmovun_high_s16(uint8x8_t __p0, int16x8_t __p1) { + uint8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); int16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); uint8x16_t __ret; __ret = __noswap_vcombine_u8((uint8x8_t)(__rev0), __noswap_vqmovun_s16(__rev1)); @@ -57390,78 +59060,78 @@ __ai int16_t vqrdmulhh_s16(int16_t __p0, int16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmulhs_lane_s32(__p0_655, __p1_655, __p2_655) __extension__ ({ \ - int32_t __s0_655 = __p0_655; \ - int32x2_t __s1_655 = __p1_655; \ - int32_t __ret_655; \ - __ret_655 = vqrdmulhs_s32(__s0_655, vget_lane_s32(__s1_655, __p2_655)); \ - __ret_655; \ +#define vqrdmulhs_lane_s32(__p0_747, __p1_747, __p2_747) __extension__ ({ \ + int32_t __s0_747 = __p0_747; \ + int32x2_t __s1_747 = __p1_747; \ + int32_t __ret_747; \ + __ret_747 = vqrdmulhs_s32(__s0_747, vget_lane_s32(__s1_747, __p2_747)); \ + __ret_747; \ }) #else -#define vqrdmulhs_lane_s32(__p0_656, __p1_656, __p2_656) __extension__ ({ \ - int32_t __s0_656 = __p0_656; \ - int32x2_t __s1_656 = __p1_656; \ - int32x2_t __rev1_656; __rev1_656 = __builtin_shufflevector(__s1_656, __s1_656, 1, 0); \ - int32_t __ret_656; \ - __ret_656 = vqrdmulhs_s32(__s0_656, __noswap_vget_lane_s32(__rev1_656, __p2_656)); \ - __ret_656; \ +#define vqrdmulhs_lane_s32(__p0_748, __p1_748, __p2_748) __extension__ ({ \ + int32_t __s0_748 = __p0_748; \ + int32x2_t __s1_748 = __p1_748; \ + int32x2_t __rev1_748; __rev1_748 = __builtin_shufflevector(__s1_748, 
__s1_748, 1, 0); \ + int32_t __ret_748; \ + __ret_748 = vqrdmulhs_s32(__s0_748, __noswap_vget_lane_s32(__rev1_748, __p2_748)); \ + __ret_748; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmulhh_lane_s16(__p0_657, __p1_657, __p2_657) __extension__ ({ \ - int16_t __s0_657 = __p0_657; \ - int16x4_t __s1_657 = __p1_657; \ - int16_t __ret_657; \ - __ret_657 = vqrdmulhh_s16(__s0_657, vget_lane_s16(__s1_657, __p2_657)); \ - __ret_657; \ +#define vqrdmulhh_lane_s16(__p0_749, __p1_749, __p2_749) __extension__ ({ \ + int16_t __s0_749 = __p0_749; \ + int16x4_t __s1_749 = __p1_749; \ + int16_t __ret_749; \ + __ret_749 = vqrdmulhh_s16(__s0_749, vget_lane_s16(__s1_749, __p2_749)); \ + __ret_749; \ }) #else -#define vqrdmulhh_lane_s16(__p0_658, __p1_658, __p2_658) __extension__ ({ \ - int16_t __s0_658 = __p0_658; \ - int16x4_t __s1_658 = __p1_658; \ - int16x4_t __rev1_658; __rev1_658 = __builtin_shufflevector(__s1_658, __s1_658, 3, 2, 1, 0); \ - int16_t __ret_658; \ - __ret_658 = vqrdmulhh_s16(__s0_658, __noswap_vget_lane_s16(__rev1_658, __p2_658)); \ - __ret_658; \ +#define vqrdmulhh_lane_s16(__p0_750, __p1_750, __p2_750) __extension__ ({ \ + int16_t __s0_750 = __p0_750; \ + int16x4_t __s1_750 = __p1_750; \ + int16x4_t __rev1_750; __rev1_750 = __builtin_shufflevector(__s1_750, __s1_750, 3, 2, 1, 0); \ + int16_t __ret_750; \ + __ret_750 = vqrdmulhh_s16(__s0_750, __noswap_vget_lane_s16(__rev1_750, __p2_750)); \ + __ret_750; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmulhs_laneq_s32(__p0_659, __p1_659, __p2_659) __extension__ ({ \ - int32_t __s0_659 = __p0_659; \ - int32x4_t __s1_659 = __p1_659; \ - int32_t __ret_659; \ - __ret_659 = vqrdmulhs_s32(__s0_659, vgetq_lane_s32(__s1_659, __p2_659)); \ - __ret_659; \ +#define vqrdmulhs_laneq_s32(__p0_751, __p1_751, __p2_751) __extension__ ({ \ + int32_t __s0_751 = __p0_751; \ + int32x4_t __s1_751 = __p1_751; \ + int32_t __ret_751; \ + __ret_751 = vqrdmulhs_s32(__s0_751, vgetq_lane_s32(__s1_751, __p2_751)); \ + __ret_751; \ }) #else -#define vqrdmulhs_laneq_s32(__p0_660, __p1_660, __p2_660) __extension__ ({ \ - int32_t __s0_660 = __p0_660; \ - int32x4_t __s1_660 = __p1_660; \ - int32x4_t __rev1_660; __rev1_660 = __builtin_shufflevector(__s1_660, __s1_660, 3, 2, 1, 0); \ - int32_t __ret_660; \ - __ret_660 = vqrdmulhs_s32(__s0_660, __noswap_vgetq_lane_s32(__rev1_660, __p2_660)); \ - __ret_660; \ +#define vqrdmulhs_laneq_s32(__p0_752, __p1_752, __p2_752) __extension__ ({ \ + int32_t __s0_752 = __p0_752; \ + int32x4_t __s1_752 = __p1_752; \ + int32x4_t __rev1_752; __rev1_752 = __builtin_shufflevector(__s1_752, __s1_752, 3, 2, 1, 0); \ + int32_t __ret_752; \ + __ret_752 = vqrdmulhs_s32(__s0_752, __noswap_vgetq_lane_s32(__rev1_752, __p2_752)); \ + __ret_752; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmulhh_laneq_s16(__p0_661, __p1_661, __p2_661) __extension__ ({ \ - int16_t __s0_661 = __p0_661; \ - int16x8_t __s1_661 = __p1_661; \ - int16_t __ret_661; \ - __ret_661 = vqrdmulhh_s16(__s0_661, vgetq_lane_s16(__s1_661, __p2_661)); \ - __ret_661; \ +#define vqrdmulhh_laneq_s16(__p0_753, __p1_753, __p2_753) __extension__ ({ \ + int16_t __s0_753 = __p0_753; \ + int16x8_t __s1_753 = __p1_753; \ + int16_t __ret_753; \ + __ret_753 = vqrdmulhh_s16(__s0_753, vgetq_lane_s16(__s1_753, __p2_753)); \ + __ret_753; \ }) #else -#define vqrdmulhh_laneq_s16(__p0_662, __p1_662, __p2_662) __extension__ ({ \ - int16_t __s0_662 = __p0_662; \ - int16x8_t __s1_662 = __p1_662; \ - int16x8_t __rev1_662; __rev1_662 = __builtin_shufflevector(__s1_662, __s1_662, 7, 6, 5, 4, 3, 2, 1, 
0); \ - int16_t __ret_662; \ - __ret_662 = vqrdmulhh_s16(__s0_662, __noswap_vgetq_lane_s16(__rev1_662, __p2_662)); \ - __ret_662; \ +#define vqrdmulhh_laneq_s16(__p0_754, __p1_754, __p2_754) __extension__ ({ \ + int16_t __s0_754 = __p0_754; \ + int16x8_t __s1_754 = __p1_754; \ + int16x8_t __rev1_754; __rev1_754 = __builtin_shufflevector(__s1_754, __s1_754, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16_t __ret_754; \ + __ret_754 = vqrdmulhh_s16(__s0_754, __noswap_vgetq_lane_s16(__rev1_754, __p2_754)); \ + __ret_754; \ }) #endif @@ -57549,22 +59219,22 @@ __ai int16_t vqrdmulhh_s16(int16_t __p0, int16_t __p1) { }) #endif -__ai uint8_t vqrshlb_u8(uint8_t __p0, uint8_t __p1) { +__ai uint8_t vqrshlb_u8(uint8_t __p0, int8_t __p1) { uint8_t __ret; __ret = (uint8_t) __builtin_neon_vqrshlb_u8(__p0, __p1); return __ret; } -__ai uint32_t vqrshls_u32(uint32_t __p0, uint32_t __p1) { +__ai uint32_t vqrshls_u32(uint32_t __p0, int32_t __p1) { uint32_t __ret; __ret = (uint32_t) __builtin_neon_vqrshls_u32(__p0, __p1); return __ret; } -__ai uint64_t vqrshld_u64(uint64_t __p0, uint64_t __p1) { +__ai uint64_t vqrshld_u64(uint64_t __p0, int64_t __p1) { uint64_t __ret; __ret = (uint64_t) __builtin_neon_vqrshld_u64(__p0, __p1); return __ret; } -__ai uint16_t vqrshlh_u16(uint16_t __p0, uint16_t __p1) { +__ai uint16_t vqrshlh_u16(uint16_t __p0, int16_t __p1) { uint16_t __ret; __ret = (uint16_t) __builtin_neon_vqrshlh_u16(__p0, __p1); return __ret; @@ -57590,128 +59260,128 @@ __ai int16_t vqrshlh_s16(int16_t __p0, int16_t __p1) { return __ret; } #ifdef __LITTLE_ENDIAN__ -#define vqrshrn_high_n_u32(__p0_663, __p1_663, __p2_663) __extension__ ({ \ - uint16x4_t __s0_663 = __p0_663; \ - uint32x4_t __s1_663 = __p1_663; \ - uint16x8_t __ret_663; \ - __ret_663 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_663), (uint16x4_t)(vqrshrn_n_u32(__s1_663, __p2_663)))); \ - __ret_663; \ +#define vqrshrn_high_n_u32(__p0_755, __p1_755, __p2_755) __extension__ ({ \ + uint16x4_t __s0_755 = __p0_755; \ + uint32x4_t __s1_755 = __p1_755; \ + uint16x8_t __ret_755; \ + __ret_755 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_755), (uint16x4_t)(vqrshrn_n_u32(__s1_755, __p2_755)))); \ + __ret_755; \ }) #else -#define vqrshrn_high_n_u32(__p0_664, __p1_664, __p2_664) __extension__ ({ \ - uint16x4_t __s0_664 = __p0_664; \ - uint32x4_t __s1_664 = __p1_664; \ - uint16x4_t __rev0_664; __rev0_664 = __builtin_shufflevector(__s0_664, __s0_664, 3, 2, 1, 0); \ - uint32x4_t __rev1_664; __rev1_664 = __builtin_shufflevector(__s1_664, __s1_664, 3, 2, 1, 0); \ - uint16x8_t __ret_664; \ - __ret_664 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_664), (uint16x4_t)(__noswap_vqrshrn_n_u32(__rev1_664, __p2_664)))); \ - __ret_664 = __builtin_shufflevector(__ret_664, __ret_664, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_664; \ +#define vqrshrn_high_n_u32(__p0_756, __p1_756, __p2_756) __extension__ ({ \ + uint16x4_t __s0_756 = __p0_756; \ + uint32x4_t __s1_756 = __p1_756; \ + uint16x4_t __rev0_756; __rev0_756 = __builtin_shufflevector(__s0_756, __s0_756, 3, 2, 1, 0); \ + uint32x4_t __rev1_756; __rev1_756 = __builtin_shufflevector(__s1_756, __s1_756, 3, 2, 1, 0); \ + uint16x8_t __ret_756; \ + __ret_756 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_756), (uint16x4_t)(__noswap_vqrshrn_n_u32(__rev1_756, __p2_756)))); \ + __ret_756 = __builtin_shufflevector(__ret_756, __ret_756, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_756; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrshrn_high_n_u64(__p0_665, __p1_665, __p2_665) __extension__ ({ \ - uint32x2_t __s0_665 = __p0_665; \ - 
uint64x2_t __s1_665 = __p1_665; \ - uint32x4_t __ret_665; \ - __ret_665 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_665), (uint32x2_t)(vqrshrn_n_u64(__s1_665, __p2_665)))); \ - __ret_665; \ +#define vqrshrn_high_n_u64(__p0_757, __p1_757, __p2_757) __extension__ ({ \ + uint32x2_t __s0_757 = __p0_757; \ + uint64x2_t __s1_757 = __p1_757; \ + uint32x4_t __ret_757; \ + __ret_757 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_757), (uint32x2_t)(vqrshrn_n_u64(__s1_757, __p2_757)))); \ + __ret_757; \ }) #else -#define vqrshrn_high_n_u64(__p0_666, __p1_666, __p2_666) __extension__ ({ \ - uint32x2_t __s0_666 = __p0_666; \ - uint64x2_t __s1_666 = __p1_666; \ - uint32x2_t __rev0_666; __rev0_666 = __builtin_shufflevector(__s0_666, __s0_666, 1, 0); \ - uint64x2_t __rev1_666; __rev1_666 = __builtin_shufflevector(__s1_666, __s1_666, 1, 0); \ - uint32x4_t __ret_666; \ - __ret_666 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_666), (uint32x2_t)(__noswap_vqrshrn_n_u64(__rev1_666, __p2_666)))); \ - __ret_666 = __builtin_shufflevector(__ret_666, __ret_666, 3, 2, 1, 0); \ - __ret_666; \ +#define vqrshrn_high_n_u64(__p0_758, __p1_758, __p2_758) __extension__ ({ \ + uint32x2_t __s0_758 = __p0_758; \ + uint64x2_t __s1_758 = __p1_758; \ + uint32x2_t __rev0_758; __rev0_758 = __builtin_shufflevector(__s0_758, __s0_758, 1, 0); \ + uint64x2_t __rev1_758; __rev1_758 = __builtin_shufflevector(__s1_758, __s1_758, 1, 0); \ + uint32x4_t __ret_758; \ + __ret_758 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_758), (uint32x2_t)(__noswap_vqrshrn_n_u64(__rev1_758, __p2_758)))); \ + __ret_758 = __builtin_shufflevector(__ret_758, __ret_758, 3, 2, 1, 0); \ + __ret_758; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrshrn_high_n_u16(__p0_667, __p1_667, __p2_667) __extension__ ({ \ - uint8x8_t __s0_667 = __p0_667; \ - uint16x8_t __s1_667 = __p1_667; \ - uint8x16_t __ret_667; \ - __ret_667 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_667), (uint8x8_t)(vqrshrn_n_u16(__s1_667, __p2_667)))); \ - __ret_667; \ +#define vqrshrn_high_n_u16(__p0_759, __p1_759, __p2_759) __extension__ ({ \ + uint8x8_t __s0_759 = __p0_759; \ + uint16x8_t __s1_759 = __p1_759; \ + uint8x16_t __ret_759; \ + __ret_759 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_759), (uint8x8_t)(vqrshrn_n_u16(__s1_759, __p2_759)))); \ + __ret_759; \ }) #else -#define vqrshrn_high_n_u16(__p0_668, __p1_668, __p2_668) __extension__ ({ \ - uint8x8_t __s0_668 = __p0_668; \ - uint16x8_t __s1_668 = __p1_668; \ - uint8x8_t __rev0_668; __rev0_668 = __builtin_shufflevector(__s0_668, __s0_668, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __rev1_668; __rev1_668 = __builtin_shufflevector(__s1_668, __s1_668, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __ret_668; \ - __ret_668 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_668), (uint8x8_t)(__noswap_vqrshrn_n_u16(__rev1_668, __p2_668)))); \ - __ret_668 = __builtin_shufflevector(__ret_668, __ret_668, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_668; \ +#define vqrshrn_high_n_u16(__p0_760, __p1_760, __p2_760) __extension__ ({ \ + uint8x8_t __s0_760 = __p0_760; \ + uint16x8_t __s1_760 = __p1_760; \ + uint8x8_t __rev0_760; __rev0_760 = __builtin_shufflevector(__s0_760, __s0_760, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __rev1_760; __rev1_760 = __builtin_shufflevector(__s1_760, __s1_760, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __ret_760; \ + __ret_760 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_760), (uint8x8_t)(__noswap_vqrshrn_n_u16(__rev1_760, __p2_760)))); \ + __ret_760 = 
__builtin_shufflevector(__ret_760, __ret_760, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_760; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrshrn_high_n_s32(__p0_669, __p1_669, __p2_669) __extension__ ({ \ - int16x4_t __s0_669 = __p0_669; \ - int32x4_t __s1_669 = __p1_669; \ - int16x8_t __ret_669; \ - __ret_669 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_669), (int16x4_t)(vqrshrn_n_s32(__s1_669, __p2_669)))); \ - __ret_669; \ +#define vqrshrn_high_n_s32(__p0_761, __p1_761, __p2_761) __extension__ ({ \ + int16x4_t __s0_761 = __p0_761; \ + int32x4_t __s1_761 = __p1_761; \ + int16x8_t __ret_761; \ + __ret_761 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_761), (int16x4_t)(vqrshrn_n_s32(__s1_761, __p2_761)))); \ + __ret_761; \ }) #else -#define vqrshrn_high_n_s32(__p0_670, __p1_670, __p2_670) __extension__ ({ \ - int16x4_t __s0_670 = __p0_670; \ - int32x4_t __s1_670 = __p1_670; \ - int16x4_t __rev0_670; __rev0_670 = __builtin_shufflevector(__s0_670, __s0_670, 3, 2, 1, 0); \ - int32x4_t __rev1_670; __rev1_670 = __builtin_shufflevector(__s1_670, __s1_670, 3, 2, 1, 0); \ - int16x8_t __ret_670; \ - __ret_670 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_670), (int16x4_t)(__noswap_vqrshrn_n_s32(__rev1_670, __p2_670)))); \ - __ret_670 = __builtin_shufflevector(__ret_670, __ret_670, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_670; \ +#define vqrshrn_high_n_s32(__p0_762, __p1_762, __p2_762) __extension__ ({ \ + int16x4_t __s0_762 = __p0_762; \ + int32x4_t __s1_762 = __p1_762; \ + int16x4_t __rev0_762; __rev0_762 = __builtin_shufflevector(__s0_762, __s0_762, 3, 2, 1, 0); \ + int32x4_t __rev1_762; __rev1_762 = __builtin_shufflevector(__s1_762, __s1_762, 3, 2, 1, 0); \ + int16x8_t __ret_762; \ + __ret_762 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_762), (int16x4_t)(__noswap_vqrshrn_n_s32(__rev1_762, __p2_762)))); \ + __ret_762 = __builtin_shufflevector(__ret_762, __ret_762, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_762; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrshrn_high_n_s64(__p0_671, __p1_671, __p2_671) __extension__ ({ \ - int32x2_t __s0_671 = __p0_671; \ - int64x2_t __s1_671 = __p1_671; \ - int32x4_t __ret_671; \ - __ret_671 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_671), (int32x2_t)(vqrshrn_n_s64(__s1_671, __p2_671)))); \ - __ret_671; \ +#define vqrshrn_high_n_s64(__p0_763, __p1_763, __p2_763) __extension__ ({ \ + int32x2_t __s0_763 = __p0_763; \ + int64x2_t __s1_763 = __p1_763; \ + int32x4_t __ret_763; \ + __ret_763 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_763), (int32x2_t)(vqrshrn_n_s64(__s1_763, __p2_763)))); \ + __ret_763; \ }) #else -#define vqrshrn_high_n_s64(__p0_672, __p1_672, __p2_672) __extension__ ({ \ - int32x2_t __s0_672 = __p0_672; \ - int64x2_t __s1_672 = __p1_672; \ - int32x2_t __rev0_672; __rev0_672 = __builtin_shufflevector(__s0_672, __s0_672, 1, 0); \ - int64x2_t __rev1_672; __rev1_672 = __builtin_shufflevector(__s1_672, __s1_672, 1, 0); \ - int32x4_t __ret_672; \ - __ret_672 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_672), (int32x2_t)(__noswap_vqrshrn_n_s64(__rev1_672, __p2_672)))); \ - __ret_672 = __builtin_shufflevector(__ret_672, __ret_672, 3, 2, 1, 0); \ - __ret_672; \ +#define vqrshrn_high_n_s64(__p0_764, __p1_764, __p2_764) __extension__ ({ \ + int32x2_t __s0_764 = __p0_764; \ + int64x2_t __s1_764 = __p1_764; \ + int32x2_t __rev0_764; __rev0_764 = __builtin_shufflevector(__s0_764, __s0_764, 1, 0); \ + int64x2_t __rev1_764; __rev1_764 = __builtin_shufflevector(__s1_764, __s1_764, 1, 0); \ + int32x4_t __ret_764; \ + 
__ret_764 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_764), (int32x2_t)(__noswap_vqrshrn_n_s64(__rev1_764, __p2_764)))); \ + __ret_764 = __builtin_shufflevector(__ret_764, __ret_764, 3, 2, 1, 0); \ + __ret_764; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrshrn_high_n_s16(__p0_673, __p1_673, __p2_673) __extension__ ({ \ - int8x8_t __s0_673 = __p0_673; \ - int16x8_t __s1_673 = __p1_673; \ - int8x16_t __ret_673; \ - __ret_673 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_673), (int8x8_t)(vqrshrn_n_s16(__s1_673, __p2_673)))); \ - __ret_673; \ +#define vqrshrn_high_n_s16(__p0_765, __p1_765, __p2_765) __extension__ ({ \ + int8x8_t __s0_765 = __p0_765; \ + int16x8_t __s1_765 = __p1_765; \ + int8x16_t __ret_765; \ + __ret_765 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_765), (int8x8_t)(vqrshrn_n_s16(__s1_765, __p2_765)))); \ + __ret_765; \ }) #else -#define vqrshrn_high_n_s16(__p0_674, __p1_674, __p2_674) __extension__ ({ \ - int8x8_t __s0_674 = __p0_674; \ - int16x8_t __s1_674 = __p1_674; \ - int8x8_t __rev0_674; __rev0_674 = __builtin_shufflevector(__s0_674, __s0_674, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev1_674; __rev1_674 = __builtin_shufflevector(__s1_674, __s1_674, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __ret_674; \ - __ret_674 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_674), (int8x8_t)(__noswap_vqrshrn_n_s16(__rev1_674, __p2_674)))); \ - __ret_674 = __builtin_shufflevector(__ret_674, __ret_674, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_674; \ +#define vqrshrn_high_n_s16(__p0_766, __p1_766, __p2_766) __extension__ ({ \ + int8x8_t __s0_766 = __p0_766; \ + int16x8_t __s1_766 = __p1_766; \ + int8x8_t __rev0_766; __rev0_766 = __builtin_shufflevector(__s0_766, __s0_766, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev1_766; __rev1_766 = __builtin_shufflevector(__s1_766, __s1_766, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __ret_766; \ + __ret_766 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_766), (int8x8_t)(__noswap_vqrshrn_n_s16(__rev1_766, __p2_766)))); \ + __ret_766 = __builtin_shufflevector(__ret_766, __ret_766, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_766; \ }) #endif @@ -57752,65 +59422,65 @@ __ai int16_t vqrshlh_s16(int16_t __p0, int16_t __p1) { __ret; \ }) #ifdef __LITTLE_ENDIAN__ -#define vqrshrun_high_n_s32(__p0_675, __p1_675, __p2_675) __extension__ ({ \ - int16x4_t __s0_675 = __p0_675; \ - int32x4_t __s1_675 = __p1_675; \ - int16x8_t __ret_675; \ - __ret_675 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_675), (int16x4_t)(vqrshrun_n_s32(__s1_675, __p2_675)))); \ - __ret_675; \ +#define vqrshrun_high_n_s32(__p0_767, __p1_767, __p2_767) __extension__ ({ \ + int16x4_t __s0_767 = __p0_767; \ + int32x4_t __s1_767 = __p1_767; \ + int16x8_t __ret_767; \ + __ret_767 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_767), (int16x4_t)(vqrshrun_n_s32(__s1_767, __p2_767)))); \ + __ret_767; \ }) #else -#define vqrshrun_high_n_s32(__p0_676, __p1_676, __p2_676) __extension__ ({ \ - int16x4_t __s0_676 = __p0_676; \ - int32x4_t __s1_676 = __p1_676; \ - int16x4_t __rev0_676; __rev0_676 = __builtin_shufflevector(__s0_676, __s0_676, 3, 2, 1, 0); \ - int32x4_t __rev1_676; __rev1_676 = __builtin_shufflevector(__s1_676, __s1_676, 3, 2, 1, 0); \ - int16x8_t __ret_676; \ - __ret_676 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_676), (int16x4_t)(__noswap_vqrshrun_n_s32(__rev1_676, __p2_676)))); \ - __ret_676 = __builtin_shufflevector(__ret_676, __ret_676, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_676; \ +#define vqrshrun_high_n_s32(__p0_768, 
__p1_768, __p2_768) __extension__ ({ \ + int16x4_t __s0_768 = __p0_768; \ + int32x4_t __s1_768 = __p1_768; \ + int16x4_t __rev0_768; __rev0_768 = __builtin_shufflevector(__s0_768, __s0_768, 3, 2, 1, 0); \ + int32x4_t __rev1_768; __rev1_768 = __builtin_shufflevector(__s1_768, __s1_768, 3, 2, 1, 0); \ + int16x8_t __ret_768; \ + __ret_768 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_768), (int16x4_t)(__noswap_vqrshrun_n_s32(__rev1_768, __p2_768)))); \ + __ret_768 = __builtin_shufflevector(__ret_768, __ret_768, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_768; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrshrun_high_n_s64(__p0_677, __p1_677, __p2_677) __extension__ ({ \ - int32x2_t __s0_677 = __p0_677; \ - int64x2_t __s1_677 = __p1_677; \ - int32x4_t __ret_677; \ - __ret_677 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_677), (int32x2_t)(vqrshrun_n_s64(__s1_677, __p2_677)))); \ - __ret_677; \ +#define vqrshrun_high_n_s64(__p0_769, __p1_769, __p2_769) __extension__ ({ \ + int32x2_t __s0_769 = __p0_769; \ + int64x2_t __s1_769 = __p1_769; \ + int32x4_t __ret_769; \ + __ret_769 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_769), (int32x2_t)(vqrshrun_n_s64(__s1_769, __p2_769)))); \ + __ret_769; \ }) #else -#define vqrshrun_high_n_s64(__p0_678, __p1_678, __p2_678) __extension__ ({ \ - int32x2_t __s0_678 = __p0_678; \ - int64x2_t __s1_678 = __p1_678; \ - int32x2_t __rev0_678; __rev0_678 = __builtin_shufflevector(__s0_678, __s0_678, 1, 0); \ - int64x2_t __rev1_678; __rev1_678 = __builtin_shufflevector(__s1_678, __s1_678, 1, 0); \ - int32x4_t __ret_678; \ - __ret_678 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_678), (int32x2_t)(__noswap_vqrshrun_n_s64(__rev1_678, __p2_678)))); \ - __ret_678 = __builtin_shufflevector(__ret_678, __ret_678, 3, 2, 1, 0); \ - __ret_678; \ +#define vqrshrun_high_n_s64(__p0_770, __p1_770, __p2_770) __extension__ ({ \ + int32x2_t __s0_770 = __p0_770; \ + int64x2_t __s1_770 = __p1_770; \ + int32x2_t __rev0_770; __rev0_770 = __builtin_shufflevector(__s0_770, __s0_770, 1, 0); \ + int64x2_t __rev1_770; __rev1_770 = __builtin_shufflevector(__s1_770, __s1_770, 1, 0); \ + int32x4_t __ret_770; \ + __ret_770 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_770), (int32x2_t)(__noswap_vqrshrun_n_s64(__rev1_770, __p2_770)))); \ + __ret_770 = __builtin_shufflevector(__ret_770, __ret_770, 3, 2, 1, 0); \ + __ret_770; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrshrun_high_n_s16(__p0_679, __p1_679, __p2_679) __extension__ ({ \ - int8x8_t __s0_679 = __p0_679; \ - int16x8_t __s1_679 = __p1_679; \ - int8x16_t __ret_679; \ - __ret_679 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_679), (int8x8_t)(vqrshrun_n_s16(__s1_679, __p2_679)))); \ - __ret_679; \ +#define vqrshrun_high_n_s16(__p0_771, __p1_771, __p2_771) __extension__ ({ \ + int8x8_t __s0_771 = __p0_771; \ + int16x8_t __s1_771 = __p1_771; \ + int8x16_t __ret_771; \ + __ret_771 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_771), (int8x8_t)(vqrshrun_n_s16(__s1_771, __p2_771)))); \ + __ret_771; \ }) #else -#define vqrshrun_high_n_s16(__p0_680, __p1_680, __p2_680) __extension__ ({ \ - int8x8_t __s0_680 = __p0_680; \ - int16x8_t __s1_680 = __p1_680; \ - int8x8_t __rev0_680; __rev0_680 = __builtin_shufflevector(__s0_680, __s0_680, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev1_680; __rev1_680 = __builtin_shufflevector(__s1_680, __s1_680, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __ret_680; \ - __ret_680 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_680), (int8x8_t)(__noswap_vqrshrun_n_s16(__rev1_680, __p2_680)))); \ - __ret_680 = 
__builtin_shufflevector(__ret_680, __ret_680, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_680; \ +#define vqrshrun_high_n_s16(__p0_772, __p1_772, __p2_772) __extension__ ({ \ + int8x8_t __s0_772 = __p0_772; \ + int16x8_t __s1_772 = __p1_772; \ + int8x8_t __rev0_772; __rev0_772 = __builtin_shufflevector(__s0_772, __s0_772, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev1_772; __rev1_772 = __builtin_shufflevector(__s1_772, __s1_772, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __ret_772; \ + __ret_772 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_772), (int8x8_t)(__noswap_vqrshrun_n_s16(__rev1_772, __p2_772)))); \ + __ret_772 = __builtin_shufflevector(__ret_772, __ret_772, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_772; \ }) #endif @@ -57832,22 +59502,22 @@ __ai int16_t vqrshlh_s16(int16_t __p0, int16_t __p1) { __ret = (int8_t) __builtin_neon_vqrshrunh_n_s16(__s0, __p1); \ __ret; \ }) -__ai uint8_t vqshlb_u8(uint8_t __p0, uint8_t __p1) { +__ai uint8_t vqshlb_u8(uint8_t __p0, int8_t __p1) { uint8_t __ret; __ret = (uint8_t) __builtin_neon_vqshlb_u8(__p0, __p1); return __ret; } -__ai uint32_t vqshls_u32(uint32_t __p0, uint32_t __p1) { +__ai uint32_t vqshls_u32(uint32_t __p0, int32_t __p1) { uint32_t __ret; __ret = (uint32_t) __builtin_neon_vqshls_u32(__p0, __p1); return __ret; } -__ai uint64_t vqshld_u64(uint64_t __p0, uint64_t __p1) { +__ai uint64_t vqshld_u64(uint64_t __p0, int64_t __p1) { uint64_t __ret; __ret = (uint64_t) __builtin_neon_vqshld_u64(__p0, __p1); return __ret; } -__ai uint16_t vqshlh_u16(uint16_t __p0, uint16_t __p1) { +__ai uint16_t vqshlh_u16(uint16_t __p0, int16_t __p1) { uint16_t __ret; __ret = (uint16_t) __builtin_neon_vqshlh_u16(__p0, __p1); return __ret; @@ -57945,128 +59615,128 @@ __ai int16_t vqshlh_s16(int16_t __p0, int16_t __p1) { __ret; \ }) #ifdef __LITTLE_ENDIAN__ -#define vqshrn_high_n_u32(__p0_681, __p1_681, __p2_681) __extension__ ({ \ - uint16x4_t __s0_681 = __p0_681; \ - uint32x4_t __s1_681 = __p1_681; \ - uint16x8_t __ret_681; \ - __ret_681 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_681), (uint16x4_t)(vqshrn_n_u32(__s1_681, __p2_681)))); \ - __ret_681; \ +#define vqshrn_high_n_u32(__p0_773, __p1_773, __p2_773) __extension__ ({ \ + uint16x4_t __s0_773 = __p0_773; \ + uint32x4_t __s1_773 = __p1_773; \ + uint16x8_t __ret_773; \ + __ret_773 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_773), (uint16x4_t)(vqshrn_n_u32(__s1_773, __p2_773)))); \ + __ret_773; \ }) #else -#define vqshrn_high_n_u32(__p0_682, __p1_682, __p2_682) __extension__ ({ \ - uint16x4_t __s0_682 = __p0_682; \ - uint32x4_t __s1_682 = __p1_682; \ - uint16x4_t __rev0_682; __rev0_682 = __builtin_shufflevector(__s0_682, __s0_682, 3, 2, 1, 0); \ - uint32x4_t __rev1_682; __rev1_682 = __builtin_shufflevector(__s1_682, __s1_682, 3, 2, 1, 0); \ - uint16x8_t __ret_682; \ - __ret_682 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_682), (uint16x4_t)(__noswap_vqshrn_n_u32(__rev1_682, __p2_682)))); \ - __ret_682 = __builtin_shufflevector(__ret_682, __ret_682, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_682; \ +#define vqshrn_high_n_u32(__p0_774, __p1_774, __p2_774) __extension__ ({ \ + uint16x4_t __s0_774 = __p0_774; \ + uint32x4_t __s1_774 = __p1_774; \ + uint16x4_t __rev0_774; __rev0_774 = __builtin_shufflevector(__s0_774, __s0_774, 3, 2, 1, 0); \ + uint32x4_t __rev1_774; __rev1_774 = __builtin_shufflevector(__s1_774, __s1_774, 3, 2, 1, 0); \ + uint16x8_t __ret_774; \ + __ret_774 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_774), 
(uint16x4_t)(__noswap_vqshrn_n_u32(__rev1_774, __p2_774)))); \ + __ret_774 = __builtin_shufflevector(__ret_774, __ret_774, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_774; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqshrn_high_n_u64(__p0_683, __p1_683, __p2_683) __extension__ ({ \ - uint32x2_t __s0_683 = __p0_683; \ - uint64x2_t __s1_683 = __p1_683; \ - uint32x4_t __ret_683; \ - __ret_683 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_683), (uint32x2_t)(vqshrn_n_u64(__s1_683, __p2_683)))); \ - __ret_683; \ +#define vqshrn_high_n_u64(__p0_775, __p1_775, __p2_775) __extension__ ({ \ + uint32x2_t __s0_775 = __p0_775; \ + uint64x2_t __s1_775 = __p1_775; \ + uint32x4_t __ret_775; \ + __ret_775 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_775), (uint32x2_t)(vqshrn_n_u64(__s1_775, __p2_775)))); \ + __ret_775; \ }) #else -#define vqshrn_high_n_u64(__p0_684, __p1_684, __p2_684) __extension__ ({ \ - uint32x2_t __s0_684 = __p0_684; \ - uint64x2_t __s1_684 = __p1_684; \ - uint32x2_t __rev0_684; __rev0_684 = __builtin_shufflevector(__s0_684, __s0_684, 1, 0); \ - uint64x2_t __rev1_684; __rev1_684 = __builtin_shufflevector(__s1_684, __s1_684, 1, 0); \ - uint32x4_t __ret_684; \ - __ret_684 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_684), (uint32x2_t)(__noswap_vqshrn_n_u64(__rev1_684, __p2_684)))); \ - __ret_684 = __builtin_shufflevector(__ret_684, __ret_684, 3, 2, 1, 0); \ - __ret_684; \ +#define vqshrn_high_n_u64(__p0_776, __p1_776, __p2_776) __extension__ ({ \ + uint32x2_t __s0_776 = __p0_776; \ + uint64x2_t __s1_776 = __p1_776; \ + uint32x2_t __rev0_776; __rev0_776 = __builtin_shufflevector(__s0_776, __s0_776, 1, 0); \ + uint64x2_t __rev1_776; __rev1_776 = __builtin_shufflevector(__s1_776, __s1_776, 1, 0); \ + uint32x4_t __ret_776; \ + __ret_776 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_776), (uint32x2_t)(__noswap_vqshrn_n_u64(__rev1_776, __p2_776)))); \ + __ret_776 = __builtin_shufflevector(__ret_776, __ret_776, 3, 2, 1, 0); \ + __ret_776; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqshrn_high_n_u16(__p0_685, __p1_685, __p2_685) __extension__ ({ \ - uint8x8_t __s0_685 = __p0_685; \ - uint16x8_t __s1_685 = __p1_685; \ - uint8x16_t __ret_685; \ - __ret_685 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_685), (uint8x8_t)(vqshrn_n_u16(__s1_685, __p2_685)))); \ - __ret_685; \ +#define vqshrn_high_n_u16(__p0_777, __p1_777, __p2_777) __extension__ ({ \ + uint8x8_t __s0_777 = __p0_777; \ + uint16x8_t __s1_777 = __p1_777; \ + uint8x16_t __ret_777; \ + __ret_777 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_777), (uint8x8_t)(vqshrn_n_u16(__s1_777, __p2_777)))); \ + __ret_777; \ }) #else -#define vqshrn_high_n_u16(__p0_686, __p1_686, __p2_686) __extension__ ({ \ - uint8x8_t __s0_686 = __p0_686; \ - uint16x8_t __s1_686 = __p1_686; \ - uint8x8_t __rev0_686; __rev0_686 = __builtin_shufflevector(__s0_686, __s0_686, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __rev1_686; __rev1_686 = __builtin_shufflevector(__s1_686, __s1_686, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __ret_686; \ - __ret_686 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_686), (uint8x8_t)(__noswap_vqshrn_n_u16(__rev1_686, __p2_686)))); \ - __ret_686 = __builtin_shufflevector(__ret_686, __ret_686, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_686; \ +#define vqshrn_high_n_u16(__p0_778, __p1_778, __p2_778) __extension__ ({ \ + uint8x8_t __s0_778 = __p0_778; \ + uint16x8_t __s1_778 = __p1_778; \ + uint8x8_t __rev0_778; __rev0_778 = __builtin_shufflevector(__s0_778, __s0_778, 7, 6, 5, 4, 3, 2, 1, 0); \ + 
uint16x8_t __rev1_778; __rev1_778 = __builtin_shufflevector(__s1_778, __s1_778, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __ret_778; \ + __ret_778 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_778), (uint8x8_t)(__noswap_vqshrn_n_u16(__rev1_778, __p2_778)))); \ + __ret_778 = __builtin_shufflevector(__ret_778, __ret_778, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_778; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqshrn_high_n_s32(__p0_687, __p1_687, __p2_687) __extension__ ({ \ - int16x4_t __s0_687 = __p0_687; \ - int32x4_t __s1_687 = __p1_687; \ - int16x8_t __ret_687; \ - __ret_687 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_687), (int16x4_t)(vqshrn_n_s32(__s1_687, __p2_687)))); \ - __ret_687; \ +#define vqshrn_high_n_s32(__p0_779, __p1_779, __p2_779) __extension__ ({ \ + int16x4_t __s0_779 = __p0_779; \ + int32x4_t __s1_779 = __p1_779; \ + int16x8_t __ret_779; \ + __ret_779 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_779), (int16x4_t)(vqshrn_n_s32(__s1_779, __p2_779)))); \ + __ret_779; \ }) #else -#define vqshrn_high_n_s32(__p0_688, __p1_688, __p2_688) __extension__ ({ \ - int16x4_t __s0_688 = __p0_688; \ - int32x4_t __s1_688 = __p1_688; \ - int16x4_t __rev0_688; __rev0_688 = __builtin_shufflevector(__s0_688, __s0_688, 3, 2, 1, 0); \ - int32x4_t __rev1_688; __rev1_688 = __builtin_shufflevector(__s1_688, __s1_688, 3, 2, 1, 0); \ - int16x8_t __ret_688; \ - __ret_688 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_688), (int16x4_t)(__noswap_vqshrn_n_s32(__rev1_688, __p2_688)))); \ - __ret_688 = __builtin_shufflevector(__ret_688, __ret_688, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_688; \ +#define vqshrn_high_n_s32(__p0_780, __p1_780, __p2_780) __extension__ ({ \ + int16x4_t __s0_780 = __p0_780; \ + int32x4_t __s1_780 = __p1_780; \ + int16x4_t __rev0_780; __rev0_780 = __builtin_shufflevector(__s0_780, __s0_780, 3, 2, 1, 0); \ + int32x4_t __rev1_780; __rev1_780 = __builtin_shufflevector(__s1_780, __s1_780, 3, 2, 1, 0); \ + int16x8_t __ret_780; \ + __ret_780 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_780), (int16x4_t)(__noswap_vqshrn_n_s32(__rev1_780, __p2_780)))); \ + __ret_780 = __builtin_shufflevector(__ret_780, __ret_780, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_780; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqshrn_high_n_s64(__p0_689, __p1_689, __p2_689) __extension__ ({ \ - int32x2_t __s0_689 = __p0_689; \ - int64x2_t __s1_689 = __p1_689; \ - int32x4_t __ret_689; \ - __ret_689 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_689), (int32x2_t)(vqshrn_n_s64(__s1_689, __p2_689)))); \ - __ret_689; \ +#define vqshrn_high_n_s64(__p0_781, __p1_781, __p2_781) __extension__ ({ \ + int32x2_t __s0_781 = __p0_781; \ + int64x2_t __s1_781 = __p1_781; \ + int32x4_t __ret_781; \ + __ret_781 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_781), (int32x2_t)(vqshrn_n_s64(__s1_781, __p2_781)))); \ + __ret_781; \ }) #else -#define vqshrn_high_n_s64(__p0_690, __p1_690, __p2_690) __extension__ ({ \ - int32x2_t __s0_690 = __p0_690; \ - int64x2_t __s1_690 = __p1_690; \ - int32x2_t __rev0_690; __rev0_690 = __builtin_shufflevector(__s0_690, __s0_690, 1, 0); \ - int64x2_t __rev1_690; __rev1_690 = __builtin_shufflevector(__s1_690, __s1_690, 1, 0); \ - int32x4_t __ret_690; \ - __ret_690 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_690), (int32x2_t)(__noswap_vqshrn_n_s64(__rev1_690, __p2_690)))); \ - __ret_690 = __builtin_shufflevector(__ret_690, __ret_690, 3, 2, 1, 0); \ - __ret_690; \ +#define vqshrn_high_n_s64(__p0_782, __p1_782, __p2_782) __extension__ ({ \ + int32x2_t 
__s0_782 = __p0_782; \ + int64x2_t __s1_782 = __p1_782; \ + int32x2_t __rev0_782; __rev0_782 = __builtin_shufflevector(__s0_782, __s0_782, 1, 0); \ + int64x2_t __rev1_782; __rev1_782 = __builtin_shufflevector(__s1_782, __s1_782, 1, 0); \ + int32x4_t __ret_782; \ + __ret_782 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_782), (int32x2_t)(__noswap_vqshrn_n_s64(__rev1_782, __p2_782)))); \ + __ret_782 = __builtin_shufflevector(__ret_782, __ret_782, 3, 2, 1, 0); \ + __ret_782; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqshrn_high_n_s16(__p0_691, __p1_691, __p2_691) __extension__ ({ \ - int8x8_t __s0_691 = __p0_691; \ - int16x8_t __s1_691 = __p1_691; \ - int8x16_t __ret_691; \ - __ret_691 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_691), (int8x8_t)(vqshrn_n_s16(__s1_691, __p2_691)))); \ - __ret_691; \ +#define vqshrn_high_n_s16(__p0_783, __p1_783, __p2_783) __extension__ ({ \ + int8x8_t __s0_783 = __p0_783; \ + int16x8_t __s1_783 = __p1_783; \ + int8x16_t __ret_783; \ + __ret_783 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_783), (int8x8_t)(vqshrn_n_s16(__s1_783, __p2_783)))); \ + __ret_783; \ }) #else -#define vqshrn_high_n_s16(__p0_692, __p1_692, __p2_692) __extension__ ({ \ - int8x8_t __s0_692 = __p0_692; \ - int16x8_t __s1_692 = __p1_692; \ - int8x8_t __rev0_692; __rev0_692 = __builtin_shufflevector(__s0_692, __s0_692, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev1_692; __rev1_692 = __builtin_shufflevector(__s1_692, __s1_692, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __ret_692; \ - __ret_692 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_692), (int8x8_t)(__noswap_vqshrn_n_s16(__rev1_692, __p2_692)))); \ - __ret_692 = __builtin_shufflevector(__ret_692, __ret_692, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_692; \ +#define vqshrn_high_n_s16(__p0_784, __p1_784, __p2_784) __extension__ ({ \ + int8x8_t __s0_784 = __p0_784; \ + int16x8_t __s1_784 = __p1_784; \ + int8x8_t __rev0_784; __rev0_784 = __builtin_shufflevector(__s0_784, __s0_784, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev1_784; __rev1_784 = __builtin_shufflevector(__s1_784, __s1_784, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __ret_784; \ + __ret_784 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_784), (int8x8_t)(__noswap_vqshrn_n_s16(__rev1_784, __p2_784)))); \ + __ret_784 = __builtin_shufflevector(__ret_784, __ret_784, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_784; \ }) #endif @@ -58107,65 +59777,65 @@ __ai int16_t vqshlh_s16(int16_t __p0, int16_t __p1) { __ret; \ }) #ifdef __LITTLE_ENDIAN__ -#define vqshrun_high_n_s32(__p0_693, __p1_693, __p2_693) __extension__ ({ \ - int16x4_t __s0_693 = __p0_693; \ - int32x4_t __s1_693 = __p1_693; \ - int16x8_t __ret_693; \ - __ret_693 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_693), (int16x4_t)(vqshrun_n_s32(__s1_693, __p2_693)))); \ - __ret_693; \ +#define vqshrun_high_n_s32(__p0_785, __p1_785, __p2_785) __extension__ ({ \ + int16x4_t __s0_785 = __p0_785; \ + int32x4_t __s1_785 = __p1_785; \ + int16x8_t __ret_785; \ + __ret_785 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_785), (int16x4_t)(vqshrun_n_s32(__s1_785, __p2_785)))); \ + __ret_785; \ }) #else -#define vqshrun_high_n_s32(__p0_694, __p1_694, __p2_694) __extension__ ({ \ - int16x4_t __s0_694 = __p0_694; \ - int32x4_t __s1_694 = __p1_694; \ - int16x4_t __rev0_694; __rev0_694 = __builtin_shufflevector(__s0_694, __s0_694, 3, 2, 1, 0); \ - int32x4_t __rev1_694; __rev1_694 = __builtin_shufflevector(__s1_694, __s1_694, 3, 2, 1, 0); \ - int16x8_t __ret_694; \ - __ret_694 = 
(int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_694), (int16x4_t)(__noswap_vqshrun_n_s32(__rev1_694, __p2_694)))); \ - __ret_694 = __builtin_shufflevector(__ret_694, __ret_694, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_694; \ +#define vqshrun_high_n_s32(__p0_786, __p1_786, __p2_786) __extension__ ({ \ + int16x4_t __s0_786 = __p0_786; \ + int32x4_t __s1_786 = __p1_786; \ + int16x4_t __rev0_786; __rev0_786 = __builtin_shufflevector(__s0_786, __s0_786, 3, 2, 1, 0); \ + int32x4_t __rev1_786; __rev1_786 = __builtin_shufflevector(__s1_786, __s1_786, 3, 2, 1, 0); \ + int16x8_t __ret_786; \ + __ret_786 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_786), (int16x4_t)(__noswap_vqshrun_n_s32(__rev1_786, __p2_786)))); \ + __ret_786 = __builtin_shufflevector(__ret_786, __ret_786, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_786; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqshrun_high_n_s64(__p0_695, __p1_695, __p2_695) __extension__ ({ \ - int32x2_t __s0_695 = __p0_695; \ - int64x2_t __s1_695 = __p1_695; \ - int32x4_t __ret_695; \ - __ret_695 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_695), (int32x2_t)(vqshrun_n_s64(__s1_695, __p2_695)))); \ - __ret_695; \ +#define vqshrun_high_n_s64(__p0_787, __p1_787, __p2_787) __extension__ ({ \ + int32x2_t __s0_787 = __p0_787; \ + int64x2_t __s1_787 = __p1_787; \ + int32x4_t __ret_787; \ + __ret_787 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_787), (int32x2_t)(vqshrun_n_s64(__s1_787, __p2_787)))); \ + __ret_787; \ }) #else -#define vqshrun_high_n_s64(__p0_696, __p1_696, __p2_696) __extension__ ({ \ - int32x2_t __s0_696 = __p0_696; \ - int64x2_t __s1_696 = __p1_696; \ - int32x2_t __rev0_696; __rev0_696 = __builtin_shufflevector(__s0_696, __s0_696, 1, 0); \ - int64x2_t __rev1_696; __rev1_696 = __builtin_shufflevector(__s1_696, __s1_696, 1, 0); \ - int32x4_t __ret_696; \ - __ret_696 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_696), (int32x2_t)(__noswap_vqshrun_n_s64(__rev1_696, __p2_696)))); \ - __ret_696 = __builtin_shufflevector(__ret_696, __ret_696, 3, 2, 1, 0); \ - __ret_696; \ +#define vqshrun_high_n_s64(__p0_788, __p1_788, __p2_788) __extension__ ({ \ + int32x2_t __s0_788 = __p0_788; \ + int64x2_t __s1_788 = __p1_788; \ + int32x2_t __rev0_788; __rev0_788 = __builtin_shufflevector(__s0_788, __s0_788, 1, 0); \ + int64x2_t __rev1_788; __rev1_788 = __builtin_shufflevector(__s1_788, __s1_788, 1, 0); \ + int32x4_t __ret_788; \ + __ret_788 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_788), (int32x2_t)(__noswap_vqshrun_n_s64(__rev1_788, __p2_788)))); \ + __ret_788 = __builtin_shufflevector(__ret_788, __ret_788, 3, 2, 1, 0); \ + __ret_788; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqshrun_high_n_s16(__p0_697, __p1_697, __p2_697) __extension__ ({ \ - int8x8_t __s0_697 = __p0_697; \ - int16x8_t __s1_697 = __p1_697; \ - int8x16_t __ret_697; \ - __ret_697 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_697), (int8x8_t)(vqshrun_n_s16(__s1_697, __p2_697)))); \ - __ret_697; \ +#define vqshrun_high_n_s16(__p0_789, __p1_789, __p2_789) __extension__ ({ \ + int8x8_t __s0_789 = __p0_789; \ + int16x8_t __s1_789 = __p1_789; \ + int8x16_t __ret_789; \ + __ret_789 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_789), (int8x8_t)(vqshrun_n_s16(__s1_789, __p2_789)))); \ + __ret_789; \ }) #else -#define vqshrun_high_n_s16(__p0_698, __p1_698, __p2_698) __extension__ ({ \ - int8x8_t __s0_698 = __p0_698; \ - int16x8_t __s1_698 = __p1_698; \ - int8x8_t __rev0_698; __rev0_698 = __builtin_shufflevector(__s0_698, __s0_698, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev1_698; __rev1_698 
= __builtin_shufflevector(__s1_698, __s1_698, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __ret_698; \ - __ret_698 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_698), (int8x8_t)(__noswap_vqshrun_n_s16(__rev1_698, __p2_698)))); \ - __ret_698 = __builtin_shufflevector(__ret_698, __ret_698, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_698; \ +#define vqshrun_high_n_s16(__p0_790, __p1_790, __p2_790) __extension__ ({ \ + int8x8_t __s0_790 = __p0_790; \ + int16x8_t __s1_790 = __p1_790; \ + int8x8_t __rev0_790; __rev0_790 = __builtin_shufflevector(__s0_790, __s0_790, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev1_790; __rev1_790 = __builtin_shufflevector(__s1_790, __s1_790, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __ret_790; \ + __ret_790 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_790), (int8x8_t)(__noswap_vqshrun_n_s16(__rev1_790, __p2_790)))); \ + __ret_790 = __builtin_shufflevector(__ret_790, __ret_790, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_790; \ }) #endif @@ -59452,7 +61122,7 @@ __ai float32_t vrecpxs_f32(float32_t __p0) { __ret = (float32_t) __builtin_neon_vrecpxs_f32(__p0); return __ret; } -__ai uint64_t vrshld_u64(uint64_t __p0, uint64_t __p1) { +__ai uint64_t vrshld_u64(uint64_t __p0, int64_t __p1) { uint64_t __ret; __ret = (uint64_t) __builtin_neon_vrshld_u64(__p0, __p1); return __ret; @@ -59475,128 +61145,128 @@ __ai int64_t vrshld_s64(int64_t __p0, int64_t __p1) { __ret; \ }) #ifdef __LITTLE_ENDIAN__ -#define vrshrn_high_n_u32(__p0_699, __p1_699, __p2_699) __extension__ ({ \ - uint16x4_t __s0_699 = __p0_699; \ - uint32x4_t __s1_699 = __p1_699; \ - uint16x8_t __ret_699; \ - __ret_699 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_699), (uint16x4_t)(vrshrn_n_u32(__s1_699, __p2_699)))); \ - __ret_699; \ +#define vrshrn_high_n_u32(__p0_791, __p1_791, __p2_791) __extension__ ({ \ + uint16x4_t __s0_791 = __p0_791; \ + uint32x4_t __s1_791 = __p1_791; \ + uint16x8_t __ret_791; \ + __ret_791 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_791), (uint16x4_t)(vrshrn_n_u32(__s1_791, __p2_791)))); \ + __ret_791; \ }) #else -#define vrshrn_high_n_u32(__p0_700, __p1_700, __p2_700) __extension__ ({ \ - uint16x4_t __s0_700 = __p0_700; \ - uint32x4_t __s1_700 = __p1_700; \ - uint16x4_t __rev0_700; __rev0_700 = __builtin_shufflevector(__s0_700, __s0_700, 3, 2, 1, 0); \ - uint32x4_t __rev1_700; __rev1_700 = __builtin_shufflevector(__s1_700, __s1_700, 3, 2, 1, 0); \ - uint16x8_t __ret_700; \ - __ret_700 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_700), (uint16x4_t)(__noswap_vrshrn_n_u32(__rev1_700, __p2_700)))); \ - __ret_700 = __builtin_shufflevector(__ret_700, __ret_700, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_700; \ +#define vrshrn_high_n_u32(__p0_792, __p1_792, __p2_792) __extension__ ({ \ + uint16x4_t __s0_792 = __p0_792; \ + uint32x4_t __s1_792 = __p1_792; \ + uint16x4_t __rev0_792; __rev0_792 = __builtin_shufflevector(__s0_792, __s0_792, 3, 2, 1, 0); \ + uint32x4_t __rev1_792; __rev1_792 = __builtin_shufflevector(__s1_792, __s1_792, 3, 2, 1, 0); \ + uint16x8_t __ret_792; \ + __ret_792 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_792), (uint16x4_t)(__noswap_vrshrn_n_u32(__rev1_792, __p2_792)))); \ + __ret_792 = __builtin_shufflevector(__ret_792, __ret_792, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_792; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vrshrn_high_n_u64(__p0_701, __p1_701, __p2_701) __extension__ ({ \ - uint32x2_t __s0_701 = __p0_701; \ - uint64x2_t __s1_701 = __p1_701; \ - uint32x4_t __ret_701; \ - __ret_701 = 
(uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_701), (uint32x2_t)(vrshrn_n_u64(__s1_701, __p2_701)))); \ - __ret_701; \ +#define vrshrn_high_n_u64(__p0_793, __p1_793, __p2_793) __extension__ ({ \ + uint32x2_t __s0_793 = __p0_793; \ + uint64x2_t __s1_793 = __p1_793; \ + uint32x4_t __ret_793; \ + __ret_793 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_793), (uint32x2_t)(vrshrn_n_u64(__s1_793, __p2_793)))); \ + __ret_793; \ }) #else -#define vrshrn_high_n_u64(__p0_702, __p1_702, __p2_702) __extension__ ({ \ - uint32x2_t __s0_702 = __p0_702; \ - uint64x2_t __s1_702 = __p1_702; \ - uint32x2_t __rev0_702; __rev0_702 = __builtin_shufflevector(__s0_702, __s0_702, 1, 0); \ - uint64x2_t __rev1_702; __rev1_702 = __builtin_shufflevector(__s1_702, __s1_702, 1, 0); \ - uint32x4_t __ret_702; \ - __ret_702 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_702), (uint32x2_t)(__noswap_vrshrn_n_u64(__rev1_702, __p2_702)))); \ - __ret_702 = __builtin_shufflevector(__ret_702, __ret_702, 3, 2, 1, 0); \ - __ret_702; \ +#define vrshrn_high_n_u64(__p0_794, __p1_794, __p2_794) __extension__ ({ \ + uint32x2_t __s0_794 = __p0_794; \ + uint64x2_t __s1_794 = __p1_794; \ + uint32x2_t __rev0_794; __rev0_794 = __builtin_shufflevector(__s0_794, __s0_794, 1, 0); \ + uint64x2_t __rev1_794; __rev1_794 = __builtin_shufflevector(__s1_794, __s1_794, 1, 0); \ + uint32x4_t __ret_794; \ + __ret_794 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_794), (uint32x2_t)(__noswap_vrshrn_n_u64(__rev1_794, __p2_794)))); \ + __ret_794 = __builtin_shufflevector(__ret_794, __ret_794, 3, 2, 1, 0); \ + __ret_794; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vrshrn_high_n_u16(__p0_703, __p1_703, __p2_703) __extension__ ({ \ - uint8x8_t __s0_703 = __p0_703; \ - uint16x8_t __s1_703 = __p1_703; \ - uint8x16_t __ret_703; \ - __ret_703 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_703), (uint8x8_t)(vrshrn_n_u16(__s1_703, __p2_703)))); \ - __ret_703; \ +#define vrshrn_high_n_u16(__p0_795, __p1_795, __p2_795) __extension__ ({ \ + uint8x8_t __s0_795 = __p0_795; \ + uint16x8_t __s1_795 = __p1_795; \ + uint8x16_t __ret_795; \ + __ret_795 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_795), (uint8x8_t)(vrshrn_n_u16(__s1_795, __p2_795)))); \ + __ret_795; \ }) #else -#define vrshrn_high_n_u16(__p0_704, __p1_704, __p2_704) __extension__ ({ \ - uint8x8_t __s0_704 = __p0_704; \ - uint16x8_t __s1_704 = __p1_704; \ - uint8x8_t __rev0_704; __rev0_704 = __builtin_shufflevector(__s0_704, __s0_704, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __rev1_704; __rev1_704 = __builtin_shufflevector(__s1_704, __s1_704, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __ret_704; \ - __ret_704 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_704), (uint8x8_t)(__noswap_vrshrn_n_u16(__rev1_704, __p2_704)))); \ - __ret_704 = __builtin_shufflevector(__ret_704, __ret_704, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_704; \ +#define vrshrn_high_n_u16(__p0_796, __p1_796, __p2_796) __extension__ ({ \ + uint8x8_t __s0_796 = __p0_796; \ + uint16x8_t __s1_796 = __p1_796; \ + uint8x8_t __rev0_796; __rev0_796 = __builtin_shufflevector(__s0_796, __s0_796, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __rev1_796; __rev1_796 = __builtin_shufflevector(__s1_796, __s1_796, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __ret_796; \ + __ret_796 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_796), (uint8x8_t)(__noswap_vrshrn_n_u16(__rev1_796, __p2_796)))); \ + __ret_796 = __builtin_shufflevector(__ret_796, __ret_796, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + 
__ret_796; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vrshrn_high_n_s32(__p0_705, __p1_705, __p2_705) __extension__ ({ \ - int16x4_t __s0_705 = __p0_705; \ - int32x4_t __s1_705 = __p1_705; \ - int16x8_t __ret_705; \ - __ret_705 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_705), (int16x4_t)(vrshrn_n_s32(__s1_705, __p2_705)))); \ - __ret_705; \ +#define vrshrn_high_n_s32(__p0_797, __p1_797, __p2_797) __extension__ ({ \ + int16x4_t __s0_797 = __p0_797; \ + int32x4_t __s1_797 = __p1_797; \ + int16x8_t __ret_797; \ + __ret_797 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_797), (int16x4_t)(vrshrn_n_s32(__s1_797, __p2_797)))); \ + __ret_797; \ }) #else -#define vrshrn_high_n_s32(__p0_706, __p1_706, __p2_706) __extension__ ({ \ - int16x4_t __s0_706 = __p0_706; \ - int32x4_t __s1_706 = __p1_706; \ - int16x4_t __rev0_706; __rev0_706 = __builtin_shufflevector(__s0_706, __s0_706, 3, 2, 1, 0); \ - int32x4_t __rev1_706; __rev1_706 = __builtin_shufflevector(__s1_706, __s1_706, 3, 2, 1, 0); \ - int16x8_t __ret_706; \ - __ret_706 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_706), (int16x4_t)(__noswap_vrshrn_n_s32(__rev1_706, __p2_706)))); \ - __ret_706 = __builtin_shufflevector(__ret_706, __ret_706, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_706; \ +#define vrshrn_high_n_s32(__p0_798, __p1_798, __p2_798) __extension__ ({ \ + int16x4_t __s0_798 = __p0_798; \ + int32x4_t __s1_798 = __p1_798; \ + int16x4_t __rev0_798; __rev0_798 = __builtin_shufflevector(__s0_798, __s0_798, 3, 2, 1, 0); \ + int32x4_t __rev1_798; __rev1_798 = __builtin_shufflevector(__s1_798, __s1_798, 3, 2, 1, 0); \ + int16x8_t __ret_798; \ + __ret_798 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_798), (int16x4_t)(__noswap_vrshrn_n_s32(__rev1_798, __p2_798)))); \ + __ret_798 = __builtin_shufflevector(__ret_798, __ret_798, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_798; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vrshrn_high_n_s64(__p0_707, __p1_707, __p2_707) __extension__ ({ \ - int32x2_t __s0_707 = __p0_707; \ - int64x2_t __s1_707 = __p1_707; \ - int32x4_t __ret_707; \ - __ret_707 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_707), (int32x2_t)(vrshrn_n_s64(__s1_707, __p2_707)))); \ - __ret_707; \ +#define vrshrn_high_n_s64(__p0_799, __p1_799, __p2_799) __extension__ ({ \ + int32x2_t __s0_799 = __p0_799; \ + int64x2_t __s1_799 = __p1_799; \ + int32x4_t __ret_799; \ + __ret_799 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_799), (int32x2_t)(vrshrn_n_s64(__s1_799, __p2_799)))); \ + __ret_799; \ }) #else -#define vrshrn_high_n_s64(__p0_708, __p1_708, __p2_708) __extension__ ({ \ - int32x2_t __s0_708 = __p0_708; \ - int64x2_t __s1_708 = __p1_708; \ - int32x2_t __rev0_708; __rev0_708 = __builtin_shufflevector(__s0_708, __s0_708, 1, 0); \ - int64x2_t __rev1_708; __rev1_708 = __builtin_shufflevector(__s1_708, __s1_708, 1, 0); \ - int32x4_t __ret_708; \ - __ret_708 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_708), (int32x2_t)(__noswap_vrshrn_n_s64(__rev1_708, __p2_708)))); \ - __ret_708 = __builtin_shufflevector(__ret_708, __ret_708, 3, 2, 1, 0); \ - __ret_708; \ +#define vrshrn_high_n_s64(__p0_800, __p1_800, __p2_800) __extension__ ({ \ + int32x2_t __s0_800 = __p0_800; \ + int64x2_t __s1_800 = __p1_800; \ + int32x2_t __rev0_800; __rev0_800 = __builtin_shufflevector(__s0_800, __s0_800, 1, 0); \ + int64x2_t __rev1_800; __rev1_800 = __builtin_shufflevector(__s1_800, __s1_800, 1, 0); \ + int32x4_t __ret_800; \ + __ret_800 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_800), (int32x2_t)(__noswap_vrshrn_n_s64(__rev1_800, 
__p2_800)))); \ + __ret_800 = __builtin_shufflevector(__ret_800, __ret_800, 3, 2, 1, 0); \ + __ret_800; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vrshrn_high_n_s16(__p0_709, __p1_709, __p2_709) __extension__ ({ \ - int8x8_t __s0_709 = __p0_709; \ - int16x8_t __s1_709 = __p1_709; \ - int8x16_t __ret_709; \ - __ret_709 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_709), (int8x8_t)(vrshrn_n_s16(__s1_709, __p2_709)))); \ - __ret_709; \ +#define vrshrn_high_n_s16(__p0_801, __p1_801, __p2_801) __extension__ ({ \ + int8x8_t __s0_801 = __p0_801; \ + int16x8_t __s1_801 = __p1_801; \ + int8x16_t __ret_801; \ + __ret_801 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_801), (int8x8_t)(vrshrn_n_s16(__s1_801, __p2_801)))); \ + __ret_801; \ }) #else -#define vrshrn_high_n_s16(__p0_710, __p1_710, __p2_710) __extension__ ({ \ - int8x8_t __s0_710 = __p0_710; \ - int16x8_t __s1_710 = __p1_710; \ - int8x8_t __rev0_710; __rev0_710 = __builtin_shufflevector(__s0_710, __s0_710, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev1_710; __rev1_710 = __builtin_shufflevector(__s1_710, __s1_710, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __ret_710; \ - __ret_710 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_710), (int8x8_t)(__noswap_vrshrn_n_s16(__rev1_710, __p2_710)))); \ - __ret_710 = __builtin_shufflevector(__ret_710, __ret_710, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_710; \ +#define vrshrn_high_n_s16(__p0_802, __p1_802, __p2_802) __extension__ ({ \ + int8x8_t __s0_802 = __p0_802; \ + int16x8_t __s1_802 = __p1_802; \ + int8x8_t __rev0_802; __rev0_802 = __builtin_shufflevector(__s0_802, __s0_802, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev1_802; __rev1_802 = __builtin_shufflevector(__s1_802, __s1_802, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __ret_802; \ + __ret_802 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_802), (int8x8_t)(__noswap_vrshrn_n_s16(__rev1_802, __p2_802)))); \ + __ret_802 = __builtin_shufflevector(__ret_802, __ret_802, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_802; \ }) #endif @@ -59853,7 +61523,7 @@ __ai int8x16_t vrsubhn_high_s16(int8x8_t __p0, int16x8_t __p1, int16x8_t __p2) { __ret = (float64x1_t) __builtin_neon_vset_lane_f64(__s0, (float64x1_t)__s1, __p2); \ __ret; \ }) -__ai uint64_t vshld_u64(uint64_t __p0, uint64_t __p1) { +__ai uint64_t vshld_u64(uint64_t __p0, int64_t __p1) { uint64_t __ret; __ret = (uint64_t) __builtin_neon_vshld_u64(__p0, __p1); return __ret; @@ -59876,110 +61546,110 @@ __ai int64_t vshld_s64(int64_t __p0, int64_t __p1) { __ret; \ }) #ifdef __LITTLE_ENDIAN__ -#define vshll_high_n_u8(__p0_711, __p1_711) __extension__ ({ \ - uint8x16_t __s0_711 = __p0_711; \ - uint16x8_t __ret_711; \ - __ret_711 = (uint16x8_t)(vshll_n_u8(vget_high_u8(__s0_711), __p1_711)); \ - __ret_711; \ +#define vshll_high_n_u8(__p0_803, __p1_803) __extension__ ({ \ + uint8x16_t __s0_803 = __p0_803; \ + uint16x8_t __ret_803; \ + __ret_803 = (uint16x8_t)(vshll_n_u8(vget_high_u8(__s0_803), __p1_803)); \ + __ret_803; \ }) #else -#define vshll_high_n_u8(__p0_712, __p1_712) __extension__ ({ \ - uint8x16_t __s0_712 = __p0_712; \ - uint8x16_t __rev0_712; __rev0_712 = __builtin_shufflevector(__s0_712, __s0_712, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __ret_712; \ - __ret_712 = (uint16x8_t)(__noswap_vshll_n_u8(__noswap_vget_high_u8(__rev0_712), __p1_712)); \ - __ret_712 = __builtin_shufflevector(__ret_712, __ret_712, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_712; \ +#define vshll_high_n_u8(__p0_804, __p1_804) __extension__ ({ \ + uint8x16_t 
__s0_804 = __p0_804; \ + uint8x16_t __rev0_804; __rev0_804 = __builtin_shufflevector(__s0_804, __s0_804, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __ret_804; \ + __ret_804 = (uint16x8_t)(__noswap_vshll_n_u8(__noswap_vget_high_u8(__rev0_804), __p1_804)); \ + __ret_804 = __builtin_shufflevector(__ret_804, __ret_804, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_804; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vshll_high_n_u32(__p0_713, __p1_713) __extension__ ({ \ - uint32x4_t __s0_713 = __p0_713; \ - uint64x2_t __ret_713; \ - __ret_713 = (uint64x2_t)(vshll_n_u32(vget_high_u32(__s0_713), __p1_713)); \ - __ret_713; \ +#define vshll_high_n_u32(__p0_805, __p1_805) __extension__ ({ \ + uint32x4_t __s0_805 = __p0_805; \ + uint64x2_t __ret_805; \ + __ret_805 = (uint64x2_t)(vshll_n_u32(vget_high_u32(__s0_805), __p1_805)); \ + __ret_805; \ }) #else -#define vshll_high_n_u32(__p0_714, __p1_714) __extension__ ({ \ - uint32x4_t __s0_714 = __p0_714; \ - uint32x4_t __rev0_714; __rev0_714 = __builtin_shufflevector(__s0_714, __s0_714, 3, 2, 1, 0); \ - uint64x2_t __ret_714; \ - __ret_714 = (uint64x2_t)(__noswap_vshll_n_u32(__noswap_vget_high_u32(__rev0_714), __p1_714)); \ - __ret_714 = __builtin_shufflevector(__ret_714, __ret_714, 1, 0); \ - __ret_714; \ +#define vshll_high_n_u32(__p0_806, __p1_806) __extension__ ({ \ + uint32x4_t __s0_806 = __p0_806; \ + uint32x4_t __rev0_806; __rev0_806 = __builtin_shufflevector(__s0_806, __s0_806, 3, 2, 1, 0); \ + uint64x2_t __ret_806; \ + __ret_806 = (uint64x2_t)(__noswap_vshll_n_u32(__noswap_vget_high_u32(__rev0_806), __p1_806)); \ + __ret_806 = __builtin_shufflevector(__ret_806, __ret_806, 1, 0); \ + __ret_806; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vshll_high_n_u16(__p0_715, __p1_715) __extension__ ({ \ - uint16x8_t __s0_715 = __p0_715; \ - uint32x4_t __ret_715; \ - __ret_715 = (uint32x4_t)(vshll_n_u16(vget_high_u16(__s0_715), __p1_715)); \ - __ret_715; \ +#define vshll_high_n_u16(__p0_807, __p1_807) __extension__ ({ \ + uint16x8_t __s0_807 = __p0_807; \ + uint32x4_t __ret_807; \ + __ret_807 = (uint32x4_t)(vshll_n_u16(vget_high_u16(__s0_807), __p1_807)); \ + __ret_807; \ }) #else -#define vshll_high_n_u16(__p0_716, __p1_716) __extension__ ({ \ - uint16x8_t __s0_716 = __p0_716; \ - uint16x8_t __rev0_716; __rev0_716 = __builtin_shufflevector(__s0_716, __s0_716, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint32x4_t __ret_716; \ - __ret_716 = (uint32x4_t)(__noswap_vshll_n_u16(__noswap_vget_high_u16(__rev0_716), __p1_716)); \ - __ret_716 = __builtin_shufflevector(__ret_716, __ret_716, 3, 2, 1, 0); \ - __ret_716; \ +#define vshll_high_n_u16(__p0_808, __p1_808) __extension__ ({ \ + uint16x8_t __s0_808 = __p0_808; \ + uint16x8_t __rev0_808; __rev0_808 = __builtin_shufflevector(__s0_808, __s0_808, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint32x4_t __ret_808; \ + __ret_808 = (uint32x4_t)(__noswap_vshll_n_u16(__noswap_vget_high_u16(__rev0_808), __p1_808)); \ + __ret_808 = __builtin_shufflevector(__ret_808, __ret_808, 3, 2, 1, 0); \ + __ret_808; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vshll_high_n_s8(__p0_717, __p1_717) __extension__ ({ \ - int8x16_t __s0_717 = __p0_717; \ - int16x8_t __ret_717; \ - __ret_717 = (int16x8_t)(vshll_n_s8(vget_high_s8(__s0_717), __p1_717)); \ - __ret_717; \ +#define vshll_high_n_s8(__p0_809, __p1_809) __extension__ ({ \ + int8x16_t __s0_809 = __p0_809; \ + int16x8_t __ret_809; \ + __ret_809 = (int16x8_t)(vshll_n_s8(vget_high_s8(__s0_809), __p1_809)); \ + __ret_809; \ }) #else -#define vshll_high_n_s8(__p0_718, __p1_718) __extension__ ({ \ - 
int8x16_t __s0_718 = __p0_718; \ - int8x16_t __rev0_718; __rev0_718 = __builtin_shufflevector(__s0_718, __s0_718, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __ret_718; \ - __ret_718 = (int16x8_t)(__noswap_vshll_n_s8(__noswap_vget_high_s8(__rev0_718), __p1_718)); \ - __ret_718 = __builtin_shufflevector(__ret_718, __ret_718, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_718; \ +#define vshll_high_n_s8(__p0_810, __p1_810) __extension__ ({ \ + int8x16_t __s0_810 = __p0_810; \ + int8x16_t __rev0_810; __rev0_810 = __builtin_shufflevector(__s0_810, __s0_810, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __ret_810; \ + __ret_810 = (int16x8_t)(__noswap_vshll_n_s8(__noswap_vget_high_s8(__rev0_810), __p1_810)); \ + __ret_810 = __builtin_shufflevector(__ret_810, __ret_810, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_810; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vshll_high_n_s32(__p0_719, __p1_719) __extension__ ({ \ - int32x4_t __s0_719 = __p0_719; \ - int64x2_t __ret_719; \ - __ret_719 = (int64x2_t)(vshll_n_s32(vget_high_s32(__s0_719), __p1_719)); \ - __ret_719; \ +#define vshll_high_n_s32(__p0_811, __p1_811) __extension__ ({ \ + int32x4_t __s0_811 = __p0_811; \ + int64x2_t __ret_811; \ + __ret_811 = (int64x2_t)(vshll_n_s32(vget_high_s32(__s0_811), __p1_811)); \ + __ret_811; \ }) #else -#define vshll_high_n_s32(__p0_720, __p1_720) __extension__ ({ \ - int32x4_t __s0_720 = __p0_720; \ - int32x4_t __rev0_720; __rev0_720 = __builtin_shufflevector(__s0_720, __s0_720, 3, 2, 1, 0); \ - int64x2_t __ret_720; \ - __ret_720 = (int64x2_t)(__noswap_vshll_n_s32(__noswap_vget_high_s32(__rev0_720), __p1_720)); \ - __ret_720 = __builtin_shufflevector(__ret_720, __ret_720, 1, 0); \ - __ret_720; \ +#define vshll_high_n_s32(__p0_812, __p1_812) __extension__ ({ \ + int32x4_t __s0_812 = __p0_812; \ + int32x4_t __rev0_812; __rev0_812 = __builtin_shufflevector(__s0_812, __s0_812, 3, 2, 1, 0); \ + int64x2_t __ret_812; \ + __ret_812 = (int64x2_t)(__noswap_vshll_n_s32(__noswap_vget_high_s32(__rev0_812), __p1_812)); \ + __ret_812 = __builtin_shufflevector(__ret_812, __ret_812, 1, 0); \ + __ret_812; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vshll_high_n_s16(__p0_721, __p1_721) __extension__ ({ \ - int16x8_t __s0_721 = __p0_721; \ - int32x4_t __ret_721; \ - __ret_721 = (int32x4_t)(vshll_n_s16(vget_high_s16(__s0_721), __p1_721)); \ - __ret_721; \ +#define vshll_high_n_s16(__p0_813, __p1_813) __extension__ ({ \ + int16x8_t __s0_813 = __p0_813; \ + int32x4_t __ret_813; \ + __ret_813 = (int32x4_t)(vshll_n_s16(vget_high_s16(__s0_813), __p1_813)); \ + __ret_813; \ }) #else -#define vshll_high_n_s16(__p0_722, __p1_722) __extension__ ({ \ - int16x8_t __s0_722 = __p0_722; \ - int16x8_t __rev0_722; __rev0_722 = __builtin_shufflevector(__s0_722, __s0_722, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32x4_t __ret_722; \ - __ret_722 = (int32x4_t)(__noswap_vshll_n_s16(__noswap_vget_high_s16(__rev0_722), __p1_722)); \ - __ret_722 = __builtin_shufflevector(__ret_722, __ret_722, 3, 2, 1, 0); \ - __ret_722; \ +#define vshll_high_n_s16(__p0_814, __p1_814) __extension__ ({ \ + int16x8_t __s0_814 = __p0_814; \ + int16x8_t __rev0_814; __rev0_814 = __builtin_shufflevector(__s0_814, __s0_814, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x4_t __ret_814; \ + __ret_814 = (int32x4_t)(__noswap_vshll_n_s16(__noswap_vget_high_s16(__rev0_814), __p1_814)); \ + __ret_814 = __builtin_shufflevector(__ret_814, __ret_814, 3, 2, 1, 0); \ + __ret_814; \ }) #endif @@ -59996,128 +61666,128 @@ __ai int64_t vshld_s64(int64_t __p0, int64_t __p1) { __ret; \ }) 
#ifdef __LITTLE_ENDIAN__ -#define vshrn_high_n_u32(__p0_723, __p1_723, __p2_723) __extension__ ({ \ - uint16x4_t __s0_723 = __p0_723; \ - uint32x4_t __s1_723 = __p1_723; \ - uint16x8_t __ret_723; \ - __ret_723 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_723), (uint16x4_t)(vshrn_n_u32(__s1_723, __p2_723)))); \ - __ret_723; \ +#define vshrn_high_n_u32(__p0_815, __p1_815, __p2_815) __extension__ ({ \ + uint16x4_t __s0_815 = __p0_815; \ + uint32x4_t __s1_815 = __p1_815; \ + uint16x8_t __ret_815; \ + __ret_815 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_815), (uint16x4_t)(vshrn_n_u32(__s1_815, __p2_815)))); \ + __ret_815; \ }) #else -#define vshrn_high_n_u32(__p0_724, __p1_724, __p2_724) __extension__ ({ \ - uint16x4_t __s0_724 = __p0_724; \ - uint32x4_t __s1_724 = __p1_724; \ - uint16x4_t __rev0_724; __rev0_724 = __builtin_shufflevector(__s0_724, __s0_724, 3, 2, 1, 0); \ - uint32x4_t __rev1_724; __rev1_724 = __builtin_shufflevector(__s1_724, __s1_724, 3, 2, 1, 0); \ - uint16x8_t __ret_724; \ - __ret_724 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_724), (uint16x4_t)(__noswap_vshrn_n_u32(__rev1_724, __p2_724)))); \ - __ret_724 = __builtin_shufflevector(__ret_724, __ret_724, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_724; \ +#define vshrn_high_n_u32(__p0_816, __p1_816, __p2_816) __extension__ ({ \ + uint16x4_t __s0_816 = __p0_816; \ + uint32x4_t __s1_816 = __p1_816; \ + uint16x4_t __rev0_816; __rev0_816 = __builtin_shufflevector(__s0_816, __s0_816, 3, 2, 1, 0); \ + uint32x4_t __rev1_816; __rev1_816 = __builtin_shufflevector(__s1_816, __s1_816, 3, 2, 1, 0); \ + uint16x8_t __ret_816; \ + __ret_816 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_816), (uint16x4_t)(__noswap_vshrn_n_u32(__rev1_816, __p2_816)))); \ + __ret_816 = __builtin_shufflevector(__ret_816, __ret_816, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_816; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vshrn_high_n_u64(__p0_725, __p1_725, __p2_725) __extension__ ({ \ - uint32x2_t __s0_725 = __p0_725; \ - uint64x2_t __s1_725 = __p1_725; \ - uint32x4_t __ret_725; \ - __ret_725 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_725), (uint32x2_t)(vshrn_n_u64(__s1_725, __p2_725)))); \ - __ret_725; \ +#define vshrn_high_n_u64(__p0_817, __p1_817, __p2_817) __extension__ ({ \ + uint32x2_t __s0_817 = __p0_817; \ + uint64x2_t __s1_817 = __p1_817; \ + uint32x4_t __ret_817; \ + __ret_817 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_817), (uint32x2_t)(vshrn_n_u64(__s1_817, __p2_817)))); \ + __ret_817; \ }) #else -#define vshrn_high_n_u64(__p0_726, __p1_726, __p2_726) __extension__ ({ \ - uint32x2_t __s0_726 = __p0_726; \ - uint64x2_t __s1_726 = __p1_726; \ - uint32x2_t __rev0_726; __rev0_726 = __builtin_shufflevector(__s0_726, __s0_726, 1, 0); \ - uint64x2_t __rev1_726; __rev1_726 = __builtin_shufflevector(__s1_726, __s1_726, 1, 0); \ - uint32x4_t __ret_726; \ - __ret_726 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_726), (uint32x2_t)(__noswap_vshrn_n_u64(__rev1_726, __p2_726)))); \ - __ret_726 = __builtin_shufflevector(__ret_726, __ret_726, 3, 2, 1, 0); \ - __ret_726; \ +#define vshrn_high_n_u64(__p0_818, __p1_818, __p2_818) __extension__ ({ \ + uint32x2_t __s0_818 = __p0_818; \ + uint64x2_t __s1_818 = __p1_818; \ + uint32x2_t __rev0_818; __rev0_818 = __builtin_shufflevector(__s0_818, __s0_818, 1, 0); \ + uint64x2_t __rev1_818; __rev1_818 = __builtin_shufflevector(__s1_818, __s1_818, 1, 0); \ + uint32x4_t __ret_818; \ + __ret_818 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_818), 
(uint32x2_t)(__noswap_vshrn_n_u64(__rev1_818, __p2_818)))); \ + __ret_818 = __builtin_shufflevector(__ret_818, __ret_818, 3, 2, 1, 0); \ + __ret_818; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vshrn_high_n_u16(__p0_727, __p1_727, __p2_727) __extension__ ({ \ - uint8x8_t __s0_727 = __p0_727; \ - uint16x8_t __s1_727 = __p1_727; \ - uint8x16_t __ret_727; \ - __ret_727 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_727), (uint8x8_t)(vshrn_n_u16(__s1_727, __p2_727)))); \ - __ret_727; \ +#define vshrn_high_n_u16(__p0_819, __p1_819, __p2_819) __extension__ ({ \ + uint8x8_t __s0_819 = __p0_819; \ + uint16x8_t __s1_819 = __p1_819; \ + uint8x16_t __ret_819; \ + __ret_819 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_819), (uint8x8_t)(vshrn_n_u16(__s1_819, __p2_819)))); \ + __ret_819; \ }) #else -#define vshrn_high_n_u16(__p0_728, __p1_728, __p2_728) __extension__ ({ \ - uint8x8_t __s0_728 = __p0_728; \ - uint16x8_t __s1_728 = __p1_728; \ - uint8x8_t __rev0_728; __rev0_728 = __builtin_shufflevector(__s0_728, __s0_728, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __rev1_728; __rev1_728 = __builtin_shufflevector(__s1_728, __s1_728, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __ret_728; \ - __ret_728 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_728), (uint8x8_t)(__noswap_vshrn_n_u16(__rev1_728, __p2_728)))); \ - __ret_728 = __builtin_shufflevector(__ret_728, __ret_728, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_728; \ +#define vshrn_high_n_u16(__p0_820, __p1_820, __p2_820) __extension__ ({ \ + uint8x8_t __s0_820 = __p0_820; \ + uint16x8_t __s1_820 = __p1_820; \ + uint8x8_t __rev0_820; __rev0_820 = __builtin_shufflevector(__s0_820, __s0_820, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __rev1_820; __rev1_820 = __builtin_shufflevector(__s1_820, __s1_820, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __ret_820; \ + __ret_820 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_820), (uint8x8_t)(__noswap_vshrn_n_u16(__rev1_820, __p2_820)))); \ + __ret_820 = __builtin_shufflevector(__ret_820, __ret_820, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_820; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vshrn_high_n_s32(__p0_729, __p1_729, __p2_729) __extension__ ({ \ - int16x4_t __s0_729 = __p0_729; \ - int32x4_t __s1_729 = __p1_729; \ - int16x8_t __ret_729; \ - __ret_729 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_729), (int16x4_t)(vshrn_n_s32(__s1_729, __p2_729)))); \ - __ret_729; \ +#define vshrn_high_n_s32(__p0_821, __p1_821, __p2_821) __extension__ ({ \ + int16x4_t __s0_821 = __p0_821; \ + int32x4_t __s1_821 = __p1_821; \ + int16x8_t __ret_821; \ + __ret_821 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_821), (int16x4_t)(vshrn_n_s32(__s1_821, __p2_821)))); \ + __ret_821; \ }) #else -#define vshrn_high_n_s32(__p0_730, __p1_730, __p2_730) __extension__ ({ \ - int16x4_t __s0_730 = __p0_730; \ - int32x4_t __s1_730 = __p1_730; \ - int16x4_t __rev0_730; __rev0_730 = __builtin_shufflevector(__s0_730, __s0_730, 3, 2, 1, 0); \ - int32x4_t __rev1_730; __rev1_730 = __builtin_shufflevector(__s1_730, __s1_730, 3, 2, 1, 0); \ - int16x8_t __ret_730; \ - __ret_730 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_730), (int16x4_t)(__noswap_vshrn_n_s32(__rev1_730, __p2_730)))); \ - __ret_730 = __builtin_shufflevector(__ret_730, __ret_730, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_730; \ +#define vshrn_high_n_s32(__p0_822, __p1_822, __p2_822) __extension__ ({ \ + int16x4_t __s0_822 = __p0_822; \ + int32x4_t __s1_822 = __p1_822; \ + int16x4_t __rev0_822; __rev0_822 = 
__builtin_shufflevector(__s0_822, __s0_822, 3, 2, 1, 0); \ + int32x4_t __rev1_822; __rev1_822 = __builtin_shufflevector(__s1_822, __s1_822, 3, 2, 1, 0); \ + int16x8_t __ret_822; \ + __ret_822 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_822), (int16x4_t)(__noswap_vshrn_n_s32(__rev1_822, __p2_822)))); \ + __ret_822 = __builtin_shufflevector(__ret_822, __ret_822, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_822; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vshrn_high_n_s64(__p0_731, __p1_731, __p2_731) __extension__ ({ \ - int32x2_t __s0_731 = __p0_731; \ - int64x2_t __s1_731 = __p1_731; \ - int32x4_t __ret_731; \ - __ret_731 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_731), (int32x2_t)(vshrn_n_s64(__s1_731, __p2_731)))); \ - __ret_731; \ +#define vshrn_high_n_s64(__p0_823, __p1_823, __p2_823) __extension__ ({ \ + int32x2_t __s0_823 = __p0_823; \ + int64x2_t __s1_823 = __p1_823; \ + int32x4_t __ret_823; \ + __ret_823 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_823), (int32x2_t)(vshrn_n_s64(__s1_823, __p2_823)))); \ + __ret_823; \ }) #else -#define vshrn_high_n_s64(__p0_732, __p1_732, __p2_732) __extension__ ({ \ - int32x2_t __s0_732 = __p0_732; \ - int64x2_t __s1_732 = __p1_732; \ - int32x2_t __rev0_732; __rev0_732 = __builtin_shufflevector(__s0_732, __s0_732, 1, 0); \ - int64x2_t __rev1_732; __rev1_732 = __builtin_shufflevector(__s1_732, __s1_732, 1, 0); \ - int32x4_t __ret_732; \ - __ret_732 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_732), (int32x2_t)(__noswap_vshrn_n_s64(__rev1_732, __p2_732)))); \ - __ret_732 = __builtin_shufflevector(__ret_732, __ret_732, 3, 2, 1, 0); \ - __ret_732; \ +#define vshrn_high_n_s64(__p0_824, __p1_824, __p2_824) __extension__ ({ \ + int32x2_t __s0_824 = __p0_824; \ + int64x2_t __s1_824 = __p1_824; \ + int32x2_t __rev0_824; __rev0_824 = __builtin_shufflevector(__s0_824, __s0_824, 1, 0); \ + int64x2_t __rev1_824; __rev1_824 = __builtin_shufflevector(__s1_824, __s1_824, 1, 0); \ + int32x4_t __ret_824; \ + __ret_824 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_824), (int32x2_t)(__noswap_vshrn_n_s64(__rev1_824, __p2_824)))); \ + __ret_824 = __builtin_shufflevector(__ret_824, __ret_824, 3, 2, 1, 0); \ + __ret_824; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vshrn_high_n_s16(__p0_733, __p1_733, __p2_733) __extension__ ({ \ - int8x8_t __s0_733 = __p0_733; \ - int16x8_t __s1_733 = __p1_733; \ - int8x16_t __ret_733; \ - __ret_733 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_733), (int8x8_t)(vshrn_n_s16(__s1_733, __p2_733)))); \ - __ret_733; \ +#define vshrn_high_n_s16(__p0_825, __p1_825, __p2_825) __extension__ ({ \ + int8x8_t __s0_825 = __p0_825; \ + int16x8_t __s1_825 = __p1_825; \ + int8x16_t __ret_825; \ + __ret_825 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_825), (int8x8_t)(vshrn_n_s16(__s1_825, __p2_825)))); \ + __ret_825; \ }) #else -#define vshrn_high_n_s16(__p0_734, __p1_734, __p2_734) __extension__ ({ \ - int8x8_t __s0_734 = __p0_734; \ - int16x8_t __s1_734 = __p1_734; \ - int8x8_t __rev0_734; __rev0_734 = __builtin_shufflevector(__s0_734, __s0_734, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev1_734; __rev1_734 = __builtin_shufflevector(__s1_734, __s1_734, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __ret_734; \ - __ret_734 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_734), (int8x8_t)(__noswap_vshrn_n_s16(__rev1_734, __p2_734)))); \ - __ret_734 = __builtin_shufflevector(__ret_734, __ret_734, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_734; \ +#define vshrn_high_n_s16(__p0_826, __p1_826, __p2_826) __extension__ ({ \ 
+ int8x8_t __s0_826 = __p0_826; \ + int16x8_t __s1_826 = __p1_826; \ + int8x8_t __rev0_826; __rev0_826 = __builtin_shufflevector(__s0_826, __s0_826, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev1_826; __rev1_826 = __builtin_shufflevector(__s1_826, __s1_826, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __ret_826; \ + __ret_826 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_826), (int8x8_t)(__noswap_vshrn_n_s16(__rev1_826, __p2_826)))); \ + __ret_826 = __builtin_shufflevector(__ret_826, __ret_826, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_826; \ }) #endif @@ -61553,54 +63223,54 @@ __ai int32x4_t vsubw_high_s16(int32x4_t __p0, int16x8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vsudotq_laneq_s32(__p0_735, __p1_735, __p2_735, __p3_735) __extension__ ({ \ - int32x4_t __s0_735 = __p0_735; \ - int8x16_t __s1_735 = __p1_735; \ - uint8x16_t __s2_735 = __p2_735; \ - int32x4_t __ret_735; \ -uint8x16_t __reint_735 = __s2_735; \ - __ret_735 = vusdotq_s32(__s0_735, (uint8x16_t)(splatq_laneq_s32(*(int32x4_t *) &__reint_735, __p3_735)), __s1_735); \ - __ret_735; \ +#define vsudotq_laneq_s32(__p0_827, __p1_827, __p2_827, __p3_827) __extension__ ({ \ + int32x4_t __s0_827 = __p0_827; \ + int8x16_t __s1_827 = __p1_827; \ + uint8x16_t __s2_827 = __p2_827; \ + int32x4_t __ret_827; \ +uint8x16_t __reint_827 = __s2_827; \ + __ret_827 = vusdotq_s32(__s0_827, (uint8x16_t)(splatq_laneq_s32(*(int32x4_t *) &__reint_827, __p3_827)), __s1_827); \ + __ret_827; \ }) #else -#define vsudotq_laneq_s32(__p0_736, __p1_736, __p2_736, __p3_736) __extension__ ({ \ - int32x4_t __s0_736 = __p0_736; \ - int8x16_t __s1_736 = __p1_736; \ - uint8x16_t __s2_736 = __p2_736; \ - int32x4_t __rev0_736; __rev0_736 = __builtin_shufflevector(__s0_736, __s0_736, 3, 2, 1, 0); \ - int8x16_t __rev1_736; __rev1_736 = __builtin_shufflevector(__s1_736, __s1_736, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __rev2_736; __rev2_736 = __builtin_shufflevector(__s2_736, __s2_736, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32x4_t __ret_736; \ -uint8x16_t __reint_736 = __rev2_736; \ - __ret_736 = __noswap_vusdotq_s32(__rev0_736, (uint8x16_t)(__noswap_splatq_laneq_s32(*(int32x4_t *) &__reint_736, __p3_736)), __rev1_736); \ - __ret_736 = __builtin_shufflevector(__ret_736, __ret_736, 3, 2, 1, 0); \ - __ret_736; \ +#define vsudotq_laneq_s32(__p0_828, __p1_828, __p2_828, __p3_828) __extension__ ({ \ + int32x4_t __s0_828 = __p0_828; \ + int8x16_t __s1_828 = __p1_828; \ + uint8x16_t __s2_828 = __p2_828; \ + int32x4_t __rev0_828; __rev0_828 = __builtin_shufflevector(__s0_828, __s0_828, 3, 2, 1, 0); \ + int8x16_t __rev1_828; __rev1_828 = __builtin_shufflevector(__s1_828, __s1_828, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev2_828; __rev2_828 = __builtin_shufflevector(__s2_828, __s2_828, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x4_t __ret_828; \ +uint8x16_t __reint_828 = __rev2_828; \ + __ret_828 = __noswap_vusdotq_s32(__rev0_828, (uint8x16_t)(__noswap_splatq_laneq_s32(*(int32x4_t *) &__reint_828, __p3_828)), __rev1_828); \ + __ret_828 = __builtin_shufflevector(__ret_828, __ret_828, 3, 2, 1, 0); \ + __ret_828; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vsudot_laneq_s32(__p0_737, __p1_737, __p2_737, __p3_737) __extension__ ({ \ - int32x2_t __s0_737 = __p0_737; \ - int8x8_t __s1_737 = __p1_737; \ - uint8x16_t __s2_737 = __p2_737; \ - int32x2_t __ret_737; \ -uint8x16_t __reint_737 = __s2_737; \ - __ret_737 = vusdot_s32(__s0_737, 
(uint8x8_t)(splat_laneq_s32(*(int32x4_t *) &__reint_737, __p3_737)), __s1_737); \ - __ret_737; \ +#define vsudot_laneq_s32(__p0_829, __p1_829, __p2_829, __p3_829) __extension__ ({ \ + int32x2_t __s0_829 = __p0_829; \ + int8x8_t __s1_829 = __p1_829; \ + uint8x16_t __s2_829 = __p2_829; \ + int32x2_t __ret_829; \ +uint8x16_t __reint_829 = __s2_829; \ + __ret_829 = vusdot_s32(__s0_829, (uint8x8_t)(splat_laneq_s32(*(int32x4_t *) &__reint_829, __p3_829)), __s1_829); \ + __ret_829; \ }) #else -#define vsudot_laneq_s32(__p0_738, __p1_738, __p2_738, __p3_738) __extension__ ({ \ - int32x2_t __s0_738 = __p0_738; \ - int8x8_t __s1_738 = __p1_738; \ - uint8x16_t __s2_738 = __p2_738; \ - int32x2_t __rev0_738; __rev0_738 = __builtin_shufflevector(__s0_738, __s0_738, 1, 0); \ - int8x8_t __rev1_738; __rev1_738 = __builtin_shufflevector(__s1_738, __s1_738, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __rev2_738; __rev2_738 = __builtin_shufflevector(__s2_738, __s2_738, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32x2_t __ret_738; \ -uint8x16_t __reint_738 = __rev2_738; \ - __ret_738 = __noswap_vusdot_s32(__rev0_738, (uint8x8_t)(__noswap_splat_laneq_s32(*(int32x4_t *) &__reint_738, __p3_738)), __rev1_738); \ - __ret_738 = __builtin_shufflevector(__ret_738, __ret_738, 1, 0); \ - __ret_738; \ +#define vsudot_laneq_s32(__p0_830, __p1_830, __p2_830, __p3_830) __extension__ ({ \ + int32x2_t __s0_830 = __p0_830; \ + int8x8_t __s1_830 = __p1_830; \ + uint8x16_t __s2_830 = __p2_830; \ + int32x2_t __rev0_830; __rev0_830 = __builtin_shufflevector(__s0_830, __s0_830, 1, 0); \ + int8x8_t __rev1_830; __rev1_830 = __builtin_shufflevector(__s1_830, __s1_830, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev2_830; __rev2_830 = __builtin_shufflevector(__s2_830, __s2_830, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x2_t __ret_830; \ +uint8x16_t __reint_830 = __rev2_830; \ + __ret_830 = __noswap_vusdot_s32(__rev0_830, (uint8x8_t)(__noswap_splat_laneq_s32(*(int32x4_t *) &__reint_830, __p3_830)), __rev1_830); \ + __ret_830 = __builtin_shufflevector(__ret_830, __ret_830, 1, 0); \ + __ret_830; \ }) #endif @@ -62423,9 +64093,9 @@ __ai uint64_t vtstd_u64(uint64_t __p0, uint64_t __p1) { __ret = (uint64_t) __builtin_neon_vtstd_u64(__p0, __p1); return __ret; } -__ai int64_t vtstd_s64(int64_t __p0, int64_t __p1) { - int64_t __ret; - __ret = (int64_t) __builtin_neon_vtstd_s64(__p0, __p1); +__ai uint64_t vtstd_s64(int64_t __p0, int64_t __p1) { + uint64_t __ret; + __ret = (uint64_t) __builtin_neon_vtstd_s64(__p0, __p1); return __ret; } __ai int8_t vuqaddb_s8(int8_t __p0, uint8_t __p1) { @@ -62573,54 +64243,54 @@ __ai int16x4_t vuqadd_s16(int16x4_t __p0, uint16x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vusdotq_laneq_s32(__p0_739, __p1_739, __p2_739, __p3_739) __extension__ ({ \ - int32x4_t __s0_739 = __p0_739; \ - uint8x16_t __s1_739 = __p1_739; \ - int8x16_t __s2_739 = __p2_739; \ - int32x4_t __ret_739; \ -int8x16_t __reint_739 = __s2_739; \ - __ret_739 = vusdotq_s32(__s0_739, __s1_739, (int8x16_t)(splatq_laneq_s32(*(int32x4_t *) &__reint_739, __p3_739))); \ - __ret_739; \ +#define vusdotq_laneq_s32(__p0_831, __p1_831, __p2_831, __p3_831) __extension__ ({ \ + int32x4_t __s0_831 = __p0_831; \ + uint8x16_t __s1_831 = __p1_831; \ + int8x16_t __s2_831 = __p2_831; \ + int32x4_t __ret_831; \ +int8x16_t __reint_831 = __s2_831; \ + __ret_831 = vusdotq_s32(__s0_831, __s1_831, (int8x16_t)(splatq_laneq_s32(*(int32x4_t *) &__reint_831, __p3_831))); \ + __ret_831; \ }) #else -#define vusdotq_laneq_s32(__p0_740, 
__p1_740, __p2_740, __p3_740) __extension__ ({ \ - int32x4_t __s0_740 = __p0_740; \ - uint8x16_t __s1_740 = __p1_740; \ - int8x16_t __s2_740 = __p2_740; \ - int32x4_t __rev0_740; __rev0_740 = __builtin_shufflevector(__s0_740, __s0_740, 3, 2, 1, 0); \ - uint8x16_t __rev1_740; __rev1_740 = __builtin_shufflevector(__s1_740, __s1_740, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __rev2_740; __rev2_740 = __builtin_shufflevector(__s2_740, __s2_740, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32x4_t __ret_740; \ -int8x16_t __reint_740 = __rev2_740; \ - __ret_740 = __noswap_vusdotq_s32(__rev0_740, __rev1_740, (int8x16_t)(__noswap_splatq_laneq_s32(*(int32x4_t *) &__reint_740, __p3_740))); \ - __ret_740 = __builtin_shufflevector(__ret_740, __ret_740, 3, 2, 1, 0); \ - __ret_740; \ +#define vusdotq_laneq_s32(__p0_832, __p1_832, __p2_832, __p3_832) __extension__ ({ \ + int32x4_t __s0_832 = __p0_832; \ + uint8x16_t __s1_832 = __p1_832; \ + int8x16_t __s2_832 = __p2_832; \ + int32x4_t __rev0_832; __rev0_832 = __builtin_shufflevector(__s0_832, __s0_832, 3, 2, 1, 0); \ + uint8x16_t __rev1_832; __rev1_832 = __builtin_shufflevector(__s1_832, __s1_832, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __rev2_832; __rev2_832 = __builtin_shufflevector(__s2_832, __s2_832, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x4_t __ret_832; \ +int8x16_t __reint_832 = __rev2_832; \ + __ret_832 = __noswap_vusdotq_s32(__rev0_832, __rev1_832, (int8x16_t)(__noswap_splatq_laneq_s32(*(int32x4_t *) &__reint_832, __p3_832))); \ + __ret_832 = __builtin_shufflevector(__ret_832, __ret_832, 3, 2, 1, 0); \ + __ret_832; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vusdot_laneq_s32(__p0_741, __p1_741, __p2_741, __p3_741) __extension__ ({ \ - int32x2_t __s0_741 = __p0_741; \ - uint8x8_t __s1_741 = __p1_741; \ - int8x16_t __s2_741 = __p2_741; \ - int32x2_t __ret_741; \ -int8x16_t __reint_741 = __s2_741; \ - __ret_741 = vusdot_s32(__s0_741, __s1_741, (int8x8_t)(splat_laneq_s32(*(int32x4_t *) &__reint_741, __p3_741))); \ - __ret_741; \ +#define vusdot_laneq_s32(__p0_833, __p1_833, __p2_833, __p3_833) __extension__ ({ \ + int32x2_t __s0_833 = __p0_833; \ + uint8x8_t __s1_833 = __p1_833; \ + int8x16_t __s2_833 = __p2_833; \ + int32x2_t __ret_833; \ +int8x16_t __reint_833 = __s2_833; \ + __ret_833 = vusdot_s32(__s0_833, __s1_833, (int8x8_t)(splat_laneq_s32(*(int32x4_t *) &__reint_833, __p3_833))); \ + __ret_833; \ }) #else -#define vusdot_laneq_s32(__p0_742, __p1_742, __p2_742, __p3_742) __extension__ ({ \ - int32x2_t __s0_742 = __p0_742; \ - uint8x8_t __s1_742 = __p1_742; \ - int8x16_t __s2_742 = __p2_742; \ - int32x2_t __rev0_742; __rev0_742 = __builtin_shufflevector(__s0_742, __s0_742, 1, 0); \ - uint8x8_t __rev1_742; __rev1_742 = __builtin_shufflevector(__s1_742, __s1_742, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __rev2_742; __rev2_742 = __builtin_shufflevector(__s2_742, __s2_742, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32x2_t __ret_742; \ -int8x16_t __reint_742 = __rev2_742; \ - __ret_742 = __noswap_vusdot_s32(__rev0_742, __rev1_742, (int8x8_t)(__noswap_splat_laneq_s32(*(int32x4_t *) &__reint_742, __p3_742))); \ - __ret_742 = __builtin_shufflevector(__ret_742, __ret_742, 1, 0); \ - __ret_742; \ +#define vusdot_laneq_s32(__p0_834, __p1_834, __p2_834, __p3_834) __extension__ ({ \ + int32x2_t __s0_834 = __p0_834; \ + uint8x8_t __s1_834 = __p1_834; \ + int8x16_t __s2_834 = __p2_834; \ + int32x2_t __rev0_834; __rev0_834 = 
__builtin_shufflevector(__s0_834, __s0_834, 1, 0); \ + uint8x8_t __rev1_834; __rev1_834 = __builtin_shufflevector(__s1_834, __s1_834, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __rev2_834; __rev2_834 = __builtin_shufflevector(__s2_834, __s2_834, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x2_t __ret_834; \ +int8x16_t __reint_834 = __rev2_834; \ + __ret_834 = __noswap_vusdot_s32(__rev0_834, __rev1_834, (int8x8_t)(__noswap_splat_laneq_s32(*(int32x4_t *) &__reint_834, __p3_834))); \ + __ret_834 = __builtin_shufflevector(__ret_834, __ret_834, 1, 0); \ + __ret_834; \ }) #endif @@ -64674,60 +66344,60 @@ __ai int32x4_t vaddw_s16(int32x4_t __p0, int16x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vget_lane_f16(__p0_743, __p1_743) __extension__ ({ \ - float16x4_t __s0_743 = __p0_743; \ - float16_t __ret_743; \ -float16x4_t __reint_743 = __s0_743; \ -int16_t __reint1_743 = vget_lane_s16(*(int16x4_t *) &__reint_743, __p1_743); \ - __ret_743 = *(float16_t *) &__reint1_743; \ - __ret_743; \ +#define vget_lane_f16(__p0_835, __p1_835) __extension__ ({ \ + float16x4_t __s0_835 = __p0_835; \ + float16_t __ret_835; \ +float16x4_t __reint_835 = __s0_835; \ +int16_t __reint1_835 = vget_lane_s16(*(int16x4_t *) &__reint_835, __p1_835); \ + __ret_835 = *(float16_t *) &__reint1_835; \ + __ret_835; \ }) #else -#define vget_lane_f16(__p0_744, __p1_744) __extension__ ({ \ - float16x4_t __s0_744 = __p0_744; \ - float16x4_t __rev0_744; __rev0_744 = __builtin_shufflevector(__s0_744, __s0_744, 3, 2, 1, 0); \ - float16_t __ret_744; \ -float16x4_t __reint_744 = __rev0_744; \ -int16_t __reint1_744 = __noswap_vget_lane_s16(*(int16x4_t *) &__reint_744, __p1_744); \ - __ret_744 = *(float16_t *) &__reint1_744; \ - __ret_744; \ +#define vget_lane_f16(__p0_836, __p1_836) __extension__ ({ \ + float16x4_t __s0_836 = __p0_836; \ + float16x4_t __rev0_836; __rev0_836 = __builtin_shufflevector(__s0_836, __s0_836, 3, 2, 1, 0); \ + float16_t __ret_836; \ +float16x4_t __reint_836 = __rev0_836; \ +int16_t __reint1_836 = __noswap_vget_lane_s16(*(int16x4_t *) &__reint_836, __p1_836); \ + __ret_836 = *(float16_t *) &__reint1_836; \ + __ret_836; \ }) -#define __noswap_vget_lane_f16(__p0_745, __p1_745) __extension__ ({ \ - float16x4_t __s0_745 = __p0_745; \ - float16_t __ret_745; \ -float16x4_t __reint_745 = __s0_745; \ -int16_t __reint1_745 = __noswap_vget_lane_s16(*(int16x4_t *) &__reint_745, __p1_745); \ - __ret_745 = *(float16_t *) &__reint1_745; \ - __ret_745; \ +#define __noswap_vget_lane_f16(__p0_837, __p1_837) __extension__ ({ \ + float16x4_t __s0_837 = __p0_837; \ + float16_t __ret_837; \ +float16x4_t __reint_837 = __s0_837; \ +int16_t __reint1_837 = __noswap_vget_lane_s16(*(int16x4_t *) &__reint_837, __p1_837); \ + __ret_837 = *(float16_t *) &__reint1_837; \ + __ret_837; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vgetq_lane_f16(__p0_746, __p1_746) __extension__ ({ \ - float16x8_t __s0_746 = __p0_746; \ - float16_t __ret_746; \ -float16x8_t __reint_746 = __s0_746; \ -int16_t __reint1_746 = vgetq_lane_s16(*(int16x8_t *) &__reint_746, __p1_746); \ - __ret_746 = *(float16_t *) &__reint1_746; \ - __ret_746; \ +#define vgetq_lane_f16(__p0_838, __p1_838) __extension__ ({ \ + float16x8_t __s0_838 = __p0_838; \ + float16_t __ret_838; \ +float16x8_t __reint_838 = __s0_838; \ +int16_t __reint1_838 = vgetq_lane_s16(*(int16x8_t *) &__reint_838, __p1_838); \ + __ret_838 = *(float16_t *) &__reint1_838; \ + __ret_838; \ }) #else -#define vgetq_lane_f16(__p0_747, __p1_747) __extension__ ({ \ - float16x8_t __s0_747 = 
__p0_747; \ - float16x8_t __rev0_747; __rev0_747 = __builtin_shufflevector(__s0_747, __s0_747, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16_t __ret_747; \ -float16x8_t __reint_747 = __rev0_747; \ -int16_t __reint1_747 = __noswap_vgetq_lane_s16(*(int16x8_t *) &__reint_747, __p1_747); \ - __ret_747 = *(float16_t *) &__reint1_747; \ - __ret_747; \ +#define vgetq_lane_f16(__p0_839, __p1_839) __extension__ ({ \ + float16x8_t __s0_839 = __p0_839; \ + float16x8_t __rev0_839; __rev0_839 = __builtin_shufflevector(__s0_839, __s0_839, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16_t __ret_839; \ +float16x8_t __reint_839 = __rev0_839; \ +int16_t __reint1_839 = __noswap_vgetq_lane_s16(*(int16x8_t *) &__reint_839, __p1_839); \ + __ret_839 = *(float16_t *) &__reint1_839; \ + __ret_839; \ }) -#define __noswap_vgetq_lane_f16(__p0_748, __p1_748) __extension__ ({ \ - float16x8_t __s0_748 = __p0_748; \ - float16_t __ret_748; \ -float16x8_t __reint_748 = __s0_748; \ -int16_t __reint1_748 = __noswap_vgetq_lane_s16(*(int16x8_t *) &__reint_748, __p1_748); \ - __ret_748 = *(float16_t *) &__reint1_748; \ - __ret_748; \ +#define __noswap_vgetq_lane_f16(__p0_840, __p1_840) __extension__ ({ \ + float16x8_t __s0_840 = __p0_840; \ + float16_t __ret_840; \ +float16x8_t __reint_840 = __s0_840; \ +int16_t __reint1_840 = __noswap_vgetq_lane_s16(*(int16x8_t *) &__reint_840, __p1_840); \ + __ret_840 = *(float16_t *) &__reint1_840; \ + __ret_840; \ }) #endif @@ -64870,98 +66540,98 @@ __ai int32x4_t __noswap_vmlal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2 #endif #ifdef __LITTLE_ENDIAN__ -#define vmlal_lane_u32(__p0_749, __p1_749, __p2_749, __p3_749) __extension__ ({ \ - uint64x2_t __s0_749 = __p0_749; \ - uint32x2_t __s1_749 = __p1_749; \ - uint32x2_t __s2_749 = __p2_749; \ - uint64x2_t __ret_749; \ - __ret_749 = __s0_749 + vmull_u32(__s1_749, splat_lane_u32(__s2_749, __p3_749)); \ - __ret_749; \ +#define vmlal_lane_u32(__p0_841, __p1_841, __p2_841, __p3_841) __extension__ ({ \ + uint64x2_t __s0_841 = __p0_841; \ + uint32x2_t __s1_841 = __p1_841; \ + uint32x2_t __s2_841 = __p2_841; \ + uint64x2_t __ret_841; \ + __ret_841 = __s0_841 + vmull_u32(__s1_841, splat_lane_u32(__s2_841, __p3_841)); \ + __ret_841; \ }) #else -#define vmlal_lane_u32(__p0_750, __p1_750, __p2_750, __p3_750) __extension__ ({ \ - uint64x2_t __s0_750 = __p0_750; \ - uint32x2_t __s1_750 = __p1_750; \ - uint32x2_t __s2_750 = __p2_750; \ - uint64x2_t __rev0_750; __rev0_750 = __builtin_shufflevector(__s0_750, __s0_750, 1, 0); \ - uint32x2_t __rev1_750; __rev1_750 = __builtin_shufflevector(__s1_750, __s1_750, 1, 0); \ - uint32x2_t __rev2_750; __rev2_750 = __builtin_shufflevector(__s2_750, __s2_750, 1, 0); \ - uint64x2_t __ret_750; \ - __ret_750 = __rev0_750 + __noswap_vmull_u32(__rev1_750, __noswap_splat_lane_u32(__rev2_750, __p3_750)); \ - __ret_750 = __builtin_shufflevector(__ret_750, __ret_750, 1, 0); \ - __ret_750; \ +#define vmlal_lane_u32(__p0_842, __p1_842, __p2_842, __p3_842) __extension__ ({ \ + uint64x2_t __s0_842 = __p0_842; \ + uint32x2_t __s1_842 = __p1_842; \ + uint32x2_t __s2_842 = __p2_842; \ + uint64x2_t __rev0_842; __rev0_842 = __builtin_shufflevector(__s0_842, __s0_842, 1, 0); \ + uint32x2_t __rev1_842; __rev1_842 = __builtin_shufflevector(__s1_842, __s1_842, 1, 0); \ + uint32x2_t __rev2_842; __rev2_842 = __builtin_shufflevector(__s2_842, __s2_842, 1, 0); \ + uint64x2_t __ret_842; \ + __ret_842 = __rev0_842 + __noswap_vmull_u32(__rev1_842, __noswap_splat_lane_u32(__rev2_842, __p3_842)); \ + __ret_842 = __builtin_shufflevector(__ret_842, __ret_842, 1, 0); 
\ + __ret_842; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlal_lane_u16(__p0_751, __p1_751, __p2_751, __p3_751) __extension__ ({ \ - uint32x4_t __s0_751 = __p0_751; \ - uint16x4_t __s1_751 = __p1_751; \ - uint16x4_t __s2_751 = __p2_751; \ - uint32x4_t __ret_751; \ - __ret_751 = __s0_751 + vmull_u16(__s1_751, splat_lane_u16(__s2_751, __p3_751)); \ - __ret_751; \ +#define vmlal_lane_u16(__p0_843, __p1_843, __p2_843, __p3_843) __extension__ ({ \ + uint32x4_t __s0_843 = __p0_843; \ + uint16x4_t __s1_843 = __p1_843; \ + uint16x4_t __s2_843 = __p2_843; \ + uint32x4_t __ret_843; \ + __ret_843 = __s0_843 + vmull_u16(__s1_843, splat_lane_u16(__s2_843, __p3_843)); \ + __ret_843; \ }) #else -#define vmlal_lane_u16(__p0_752, __p1_752, __p2_752, __p3_752) __extension__ ({ \ - uint32x4_t __s0_752 = __p0_752; \ - uint16x4_t __s1_752 = __p1_752; \ - uint16x4_t __s2_752 = __p2_752; \ - uint32x4_t __rev0_752; __rev0_752 = __builtin_shufflevector(__s0_752, __s0_752, 3, 2, 1, 0); \ - uint16x4_t __rev1_752; __rev1_752 = __builtin_shufflevector(__s1_752, __s1_752, 3, 2, 1, 0); \ - uint16x4_t __rev2_752; __rev2_752 = __builtin_shufflevector(__s2_752, __s2_752, 3, 2, 1, 0); \ - uint32x4_t __ret_752; \ - __ret_752 = __rev0_752 + __noswap_vmull_u16(__rev1_752, __noswap_splat_lane_u16(__rev2_752, __p3_752)); \ - __ret_752 = __builtin_shufflevector(__ret_752, __ret_752, 3, 2, 1, 0); \ - __ret_752; \ +#define vmlal_lane_u16(__p0_844, __p1_844, __p2_844, __p3_844) __extension__ ({ \ + uint32x4_t __s0_844 = __p0_844; \ + uint16x4_t __s1_844 = __p1_844; \ + uint16x4_t __s2_844 = __p2_844; \ + uint32x4_t __rev0_844; __rev0_844 = __builtin_shufflevector(__s0_844, __s0_844, 3, 2, 1, 0); \ + uint16x4_t __rev1_844; __rev1_844 = __builtin_shufflevector(__s1_844, __s1_844, 3, 2, 1, 0); \ + uint16x4_t __rev2_844; __rev2_844 = __builtin_shufflevector(__s2_844, __s2_844, 3, 2, 1, 0); \ + uint32x4_t __ret_844; \ + __ret_844 = __rev0_844 + __noswap_vmull_u16(__rev1_844, __noswap_splat_lane_u16(__rev2_844, __p3_844)); \ + __ret_844 = __builtin_shufflevector(__ret_844, __ret_844, 3, 2, 1, 0); \ + __ret_844; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlal_lane_s32(__p0_753, __p1_753, __p2_753, __p3_753) __extension__ ({ \ - int64x2_t __s0_753 = __p0_753; \ - int32x2_t __s1_753 = __p1_753; \ - int32x2_t __s2_753 = __p2_753; \ - int64x2_t __ret_753; \ - __ret_753 = __s0_753 + vmull_s32(__s1_753, splat_lane_s32(__s2_753, __p3_753)); \ - __ret_753; \ +#define vmlal_lane_s32(__p0_845, __p1_845, __p2_845, __p3_845) __extension__ ({ \ + int64x2_t __s0_845 = __p0_845; \ + int32x2_t __s1_845 = __p1_845; \ + int32x2_t __s2_845 = __p2_845; \ + int64x2_t __ret_845; \ + __ret_845 = __s0_845 + vmull_s32(__s1_845, splat_lane_s32(__s2_845, __p3_845)); \ + __ret_845; \ }) #else -#define vmlal_lane_s32(__p0_754, __p1_754, __p2_754, __p3_754) __extension__ ({ \ - int64x2_t __s0_754 = __p0_754; \ - int32x2_t __s1_754 = __p1_754; \ - int32x2_t __s2_754 = __p2_754; \ - int64x2_t __rev0_754; __rev0_754 = __builtin_shufflevector(__s0_754, __s0_754, 1, 0); \ - int32x2_t __rev1_754; __rev1_754 = __builtin_shufflevector(__s1_754, __s1_754, 1, 0); \ - int32x2_t __rev2_754; __rev2_754 = __builtin_shufflevector(__s2_754, __s2_754, 1, 0); \ - int64x2_t __ret_754; \ - __ret_754 = __rev0_754 + __noswap_vmull_s32(__rev1_754, __noswap_splat_lane_s32(__rev2_754, __p3_754)); \ - __ret_754 = __builtin_shufflevector(__ret_754, __ret_754, 1, 0); \ - __ret_754; \ +#define vmlal_lane_s32(__p0_846, __p1_846, __p2_846, __p3_846) __extension__ ({ \ + int64x2_t __s0_846 = 
__p0_846; \ + int32x2_t __s1_846 = __p1_846; \ + int32x2_t __s2_846 = __p2_846; \ + int64x2_t __rev0_846; __rev0_846 = __builtin_shufflevector(__s0_846, __s0_846, 1, 0); \ + int32x2_t __rev1_846; __rev1_846 = __builtin_shufflevector(__s1_846, __s1_846, 1, 0); \ + int32x2_t __rev2_846; __rev2_846 = __builtin_shufflevector(__s2_846, __s2_846, 1, 0); \ + int64x2_t __ret_846; \ + __ret_846 = __rev0_846 + __noswap_vmull_s32(__rev1_846, __noswap_splat_lane_s32(__rev2_846, __p3_846)); \ + __ret_846 = __builtin_shufflevector(__ret_846, __ret_846, 1, 0); \ + __ret_846; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlal_lane_s16(__p0_755, __p1_755, __p2_755, __p3_755) __extension__ ({ \ - int32x4_t __s0_755 = __p0_755; \ - int16x4_t __s1_755 = __p1_755; \ - int16x4_t __s2_755 = __p2_755; \ - int32x4_t __ret_755; \ - __ret_755 = __s0_755 + vmull_s16(__s1_755, splat_lane_s16(__s2_755, __p3_755)); \ - __ret_755; \ +#define vmlal_lane_s16(__p0_847, __p1_847, __p2_847, __p3_847) __extension__ ({ \ + int32x4_t __s0_847 = __p0_847; \ + int16x4_t __s1_847 = __p1_847; \ + int16x4_t __s2_847 = __p2_847; \ + int32x4_t __ret_847; \ + __ret_847 = __s0_847 + vmull_s16(__s1_847, splat_lane_s16(__s2_847, __p3_847)); \ + __ret_847; \ }) #else -#define vmlal_lane_s16(__p0_756, __p1_756, __p2_756, __p3_756) __extension__ ({ \ - int32x4_t __s0_756 = __p0_756; \ - int16x4_t __s1_756 = __p1_756; \ - int16x4_t __s2_756 = __p2_756; \ - int32x4_t __rev0_756; __rev0_756 = __builtin_shufflevector(__s0_756, __s0_756, 3, 2, 1, 0); \ - int16x4_t __rev1_756; __rev1_756 = __builtin_shufflevector(__s1_756, __s1_756, 3, 2, 1, 0); \ - int16x4_t __rev2_756; __rev2_756 = __builtin_shufflevector(__s2_756, __s2_756, 3, 2, 1, 0); \ - int32x4_t __ret_756; \ - __ret_756 = __rev0_756 + __noswap_vmull_s16(__rev1_756, __noswap_splat_lane_s16(__rev2_756, __p3_756)); \ - __ret_756 = __builtin_shufflevector(__ret_756, __ret_756, 3, 2, 1, 0); \ - __ret_756; \ +#define vmlal_lane_s16(__p0_848, __p1_848, __p2_848, __p3_848) __extension__ ({ \ + int32x4_t __s0_848 = __p0_848; \ + int16x4_t __s1_848 = __p1_848; \ + int16x4_t __s2_848 = __p2_848; \ + int32x4_t __rev0_848; __rev0_848 = __builtin_shufflevector(__s0_848, __s0_848, 3, 2, 1, 0); \ + int16x4_t __rev1_848; __rev1_848 = __builtin_shufflevector(__s1_848, __s1_848, 3, 2, 1, 0); \ + int16x4_t __rev2_848; __rev2_848 = __builtin_shufflevector(__s2_848, __s2_848, 3, 2, 1, 0); \ + int32x4_t __ret_848; \ + __ret_848 = __rev0_848 + __noswap_vmull_s16(__rev1_848, __noswap_splat_lane_s16(__rev2_848, __p3_848)); \ + __ret_848 = __builtin_shufflevector(__ret_848, __ret_848, 3, 2, 1, 0); \ + __ret_848; \ }) #endif @@ -65192,98 +66862,98 @@ __ai int32x4_t __noswap_vmlsl_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2 #endif #ifdef __LITTLE_ENDIAN__ -#define vmlsl_lane_u32(__p0_757, __p1_757, __p2_757, __p3_757) __extension__ ({ \ - uint64x2_t __s0_757 = __p0_757; \ - uint32x2_t __s1_757 = __p1_757; \ - uint32x2_t __s2_757 = __p2_757; \ - uint64x2_t __ret_757; \ - __ret_757 = __s0_757 - vmull_u32(__s1_757, splat_lane_u32(__s2_757, __p3_757)); \ - __ret_757; \ +#define vmlsl_lane_u32(__p0_849, __p1_849, __p2_849, __p3_849) __extension__ ({ \ + uint64x2_t __s0_849 = __p0_849; \ + uint32x2_t __s1_849 = __p1_849; \ + uint32x2_t __s2_849 = __p2_849; \ + uint64x2_t __ret_849; \ + __ret_849 = __s0_849 - vmull_u32(__s1_849, splat_lane_u32(__s2_849, __p3_849)); \ + __ret_849; \ }) #else -#define vmlsl_lane_u32(__p0_758, __p1_758, __p2_758, __p3_758) __extension__ ({ \ - uint64x2_t __s0_758 = __p0_758; \ - 
uint32x2_t __s1_758 = __p1_758; \ - uint32x2_t __s2_758 = __p2_758; \ - uint64x2_t __rev0_758; __rev0_758 = __builtin_shufflevector(__s0_758, __s0_758, 1, 0); \ - uint32x2_t __rev1_758; __rev1_758 = __builtin_shufflevector(__s1_758, __s1_758, 1, 0); \ - uint32x2_t __rev2_758; __rev2_758 = __builtin_shufflevector(__s2_758, __s2_758, 1, 0); \ - uint64x2_t __ret_758; \ - __ret_758 = __rev0_758 - __noswap_vmull_u32(__rev1_758, __noswap_splat_lane_u32(__rev2_758, __p3_758)); \ - __ret_758 = __builtin_shufflevector(__ret_758, __ret_758, 1, 0); \ - __ret_758; \ +#define vmlsl_lane_u32(__p0_850, __p1_850, __p2_850, __p3_850) __extension__ ({ \ + uint64x2_t __s0_850 = __p0_850; \ + uint32x2_t __s1_850 = __p1_850; \ + uint32x2_t __s2_850 = __p2_850; \ + uint64x2_t __rev0_850; __rev0_850 = __builtin_shufflevector(__s0_850, __s0_850, 1, 0); \ + uint32x2_t __rev1_850; __rev1_850 = __builtin_shufflevector(__s1_850, __s1_850, 1, 0); \ + uint32x2_t __rev2_850; __rev2_850 = __builtin_shufflevector(__s2_850, __s2_850, 1, 0); \ + uint64x2_t __ret_850; \ + __ret_850 = __rev0_850 - __noswap_vmull_u32(__rev1_850, __noswap_splat_lane_u32(__rev2_850, __p3_850)); \ + __ret_850 = __builtin_shufflevector(__ret_850, __ret_850, 1, 0); \ + __ret_850; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlsl_lane_u16(__p0_759, __p1_759, __p2_759, __p3_759) __extension__ ({ \ - uint32x4_t __s0_759 = __p0_759; \ - uint16x4_t __s1_759 = __p1_759; \ - uint16x4_t __s2_759 = __p2_759; \ - uint32x4_t __ret_759; \ - __ret_759 = __s0_759 - vmull_u16(__s1_759, splat_lane_u16(__s2_759, __p3_759)); \ - __ret_759; \ +#define vmlsl_lane_u16(__p0_851, __p1_851, __p2_851, __p3_851) __extension__ ({ \ + uint32x4_t __s0_851 = __p0_851; \ + uint16x4_t __s1_851 = __p1_851; \ + uint16x4_t __s2_851 = __p2_851; \ + uint32x4_t __ret_851; \ + __ret_851 = __s0_851 - vmull_u16(__s1_851, splat_lane_u16(__s2_851, __p3_851)); \ + __ret_851; \ }) #else -#define vmlsl_lane_u16(__p0_760, __p1_760, __p2_760, __p3_760) __extension__ ({ \ - uint32x4_t __s0_760 = __p0_760; \ - uint16x4_t __s1_760 = __p1_760; \ - uint16x4_t __s2_760 = __p2_760; \ - uint32x4_t __rev0_760; __rev0_760 = __builtin_shufflevector(__s0_760, __s0_760, 3, 2, 1, 0); \ - uint16x4_t __rev1_760; __rev1_760 = __builtin_shufflevector(__s1_760, __s1_760, 3, 2, 1, 0); \ - uint16x4_t __rev2_760; __rev2_760 = __builtin_shufflevector(__s2_760, __s2_760, 3, 2, 1, 0); \ - uint32x4_t __ret_760; \ - __ret_760 = __rev0_760 - __noswap_vmull_u16(__rev1_760, __noswap_splat_lane_u16(__rev2_760, __p3_760)); \ - __ret_760 = __builtin_shufflevector(__ret_760, __ret_760, 3, 2, 1, 0); \ - __ret_760; \ +#define vmlsl_lane_u16(__p0_852, __p1_852, __p2_852, __p3_852) __extension__ ({ \ + uint32x4_t __s0_852 = __p0_852; \ + uint16x4_t __s1_852 = __p1_852; \ + uint16x4_t __s2_852 = __p2_852; \ + uint32x4_t __rev0_852; __rev0_852 = __builtin_shufflevector(__s0_852, __s0_852, 3, 2, 1, 0); \ + uint16x4_t __rev1_852; __rev1_852 = __builtin_shufflevector(__s1_852, __s1_852, 3, 2, 1, 0); \ + uint16x4_t __rev2_852; __rev2_852 = __builtin_shufflevector(__s2_852, __s2_852, 3, 2, 1, 0); \ + uint32x4_t __ret_852; \ + __ret_852 = __rev0_852 - __noswap_vmull_u16(__rev1_852, __noswap_splat_lane_u16(__rev2_852, __p3_852)); \ + __ret_852 = __builtin_shufflevector(__ret_852, __ret_852, 3, 2, 1, 0); \ + __ret_852; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlsl_lane_s32(__p0_761, __p1_761, __p2_761, __p3_761) __extension__ ({ \ - int64x2_t __s0_761 = __p0_761; \ - int32x2_t __s1_761 = __p1_761; \ - int32x2_t __s2_761 = __p2_761; \ 
- int64x2_t __ret_761; \ - __ret_761 = __s0_761 - vmull_s32(__s1_761, splat_lane_s32(__s2_761, __p3_761)); \ - __ret_761; \ +#define vmlsl_lane_s32(__p0_853, __p1_853, __p2_853, __p3_853) __extension__ ({ \ + int64x2_t __s0_853 = __p0_853; \ + int32x2_t __s1_853 = __p1_853; \ + int32x2_t __s2_853 = __p2_853; \ + int64x2_t __ret_853; \ + __ret_853 = __s0_853 - vmull_s32(__s1_853, splat_lane_s32(__s2_853, __p3_853)); \ + __ret_853; \ }) #else -#define vmlsl_lane_s32(__p0_762, __p1_762, __p2_762, __p3_762) __extension__ ({ \ - int64x2_t __s0_762 = __p0_762; \ - int32x2_t __s1_762 = __p1_762; \ - int32x2_t __s2_762 = __p2_762; \ - int64x2_t __rev0_762; __rev0_762 = __builtin_shufflevector(__s0_762, __s0_762, 1, 0); \ - int32x2_t __rev1_762; __rev1_762 = __builtin_shufflevector(__s1_762, __s1_762, 1, 0); \ - int32x2_t __rev2_762; __rev2_762 = __builtin_shufflevector(__s2_762, __s2_762, 1, 0); \ - int64x2_t __ret_762; \ - __ret_762 = __rev0_762 - __noswap_vmull_s32(__rev1_762, __noswap_splat_lane_s32(__rev2_762, __p3_762)); \ - __ret_762 = __builtin_shufflevector(__ret_762, __ret_762, 1, 0); \ - __ret_762; \ +#define vmlsl_lane_s32(__p0_854, __p1_854, __p2_854, __p3_854) __extension__ ({ \ + int64x2_t __s0_854 = __p0_854; \ + int32x2_t __s1_854 = __p1_854; \ + int32x2_t __s2_854 = __p2_854; \ + int64x2_t __rev0_854; __rev0_854 = __builtin_shufflevector(__s0_854, __s0_854, 1, 0); \ + int32x2_t __rev1_854; __rev1_854 = __builtin_shufflevector(__s1_854, __s1_854, 1, 0); \ + int32x2_t __rev2_854; __rev2_854 = __builtin_shufflevector(__s2_854, __s2_854, 1, 0); \ + int64x2_t __ret_854; \ + __ret_854 = __rev0_854 - __noswap_vmull_s32(__rev1_854, __noswap_splat_lane_s32(__rev2_854, __p3_854)); \ + __ret_854 = __builtin_shufflevector(__ret_854, __ret_854, 1, 0); \ + __ret_854; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlsl_lane_s16(__p0_763, __p1_763, __p2_763, __p3_763) __extension__ ({ \ - int32x4_t __s0_763 = __p0_763; \ - int16x4_t __s1_763 = __p1_763; \ - int16x4_t __s2_763 = __p2_763; \ - int32x4_t __ret_763; \ - __ret_763 = __s0_763 - vmull_s16(__s1_763, splat_lane_s16(__s2_763, __p3_763)); \ - __ret_763; \ +#define vmlsl_lane_s16(__p0_855, __p1_855, __p2_855, __p3_855) __extension__ ({ \ + int32x4_t __s0_855 = __p0_855; \ + int16x4_t __s1_855 = __p1_855; \ + int16x4_t __s2_855 = __p2_855; \ + int32x4_t __ret_855; \ + __ret_855 = __s0_855 - vmull_s16(__s1_855, splat_lane_s16(__s2_855, __p3_855)); \ + __ret_855; \ }) #else -#define vmlsl_lane_s16(__p0_764, __p1_764, __p2_764, __p3_764) __extension__ ({ \ - int32x4_t __s0_764 = __p0_764; \ - int16x4_t __s1_764 = __p1_764; \ - int16x4_t __s2_764 = __p2_764; \ - int32x4_t __rev0_764; __rev0_764 = __builtin_shufflevector(__s0_764, __s0_764, 3, 2, 1, 0); \ - int16x4_t __rev1_764; __rev1_764 = __builtin_shufflevector(__s1_764, __s1_764, 3, 2, 1, 0); \ - int16x4_t __rev2_764; __rev2_764 = __builtin_shufflevector(__s2_764, __s2_764, 3, 2, 1, 0); \ - int32x4_t __ret_764; \ - __ret_764 = __rev0_764 - __noswap_vmull_s16(__rev1_764, __noswap_splat_lane_s16(__rev2_764, __p3_764)); \ - __ret_764 = __builtin_shufflevector(__ret_764, __ret_764, 3, 2, 1, 0); \ - __ret_764; \ +#define vmlsl_lane_s16(__p0_856, __p1_856, __p2_856, __p3_856) __extension__ ({ \ + int32x4_t __s0_856 = __p0_856; \ + int16x4_t __s1_856 = __p1_856; \ + int16x4_t __s2_856 = __p2_856; \ + int32x4_t __rev0_856; __rev0_856 = __builtin_shufflevector(__s0_856, __s0_856, 3, 2, 1, 0); \ + int16x4_t __rev1_856; __rev1_856 = __builtin_shufflevector(__s1_856, __s1_856, 3, 2, 1, 0); \ + int16x4_t 
__rev2_856; __rev2_856 = __builtin_shufflevector(__s2_856, __s2_856, 3, 2, 1, 0); \ + int32x4_t __ret_856; \ + __ret_856 = __rev0_856 - __noswap_vmull_s16(__rev1_856, __noswap_splat_lane_s16(__rev2_856, __p3_856)); \ + __ret_856 = __builtin_shufflevector(__ret_856, __ret_856, 3, 2, 1, 0); \ + __ret_856; \ }) #endif @@ -65376,151 +67046,151 @@ __ai int32x4_t __noswap_vmlsl_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2 #endif #ifdef __LITTLE_ENDIAN__ -#define vset_lane_f16(__p0_765, __p1_765, __p2_765) __extension__ ({ \ - float16_t __s0_765 = __p0_765; \ - float16x4_t __s1_765 = __p1_765; \ - float16x4_t __ret_765; \ -float16_t __reint_765 = __s0_765; \ -float16x4_t __reint1_765 = __s1_765; \ -int16x4_t __reint2_765 = vset_lane_s16(*(int16_t *) &__reint_765, *(int16x4_t *) &__reint1_765, __p2_765); \ - __ret_765 = *(float16x4_t *) &__reint2_765; \ - __ret_765; \ +#define vset_lane_f16(__p0_857, __p1_857, __p2_857) __extension__ ({ \ + float16_t __s0_857 = __p0_857; \ + float16x4_t __s1_857 = __p1_857; \ + float16x4_t __ret_857; \ +float16_t __reint_857 = __s0_857; \ +float16x4_t __reint1_857 = __s1_857; \ +int16x4_t __reint2_857 = vset_lane_s16(*(int16_t *) &__reint_857, *(int16x4_t *) &__reint1_857, __p2_857); \ + __ret_857 = *(float16x4_t *) &__reint2_857; \ + __ret_857; \ }) #else -#define vset_lane_f16(__p0_766, __p1_766, __p2_766) __extension__ ({ \ - float16_t __s0_766 = __p0_766; \ - float16x4_t __s1_766 = __p1_766; \ - float16x4_t __rev1_766; __rev1_766 = __builtin_shufflevector(__s1_766, __s1_766, 3, 2, 1, 0); \ - float16x4_t __ret_766; \ -float16_t __reint_766 = __s0_766; \ -float16x4_t __reint1_766 = __rev1_766; \ -int16x4_t __reint2_766 = __noswap_vset_lane_s16(*(int16_t *) &__reint_766, *(int16x4_t *) &__reint1_766, __p2_766); \ - __ret_766 = *(float16x4_t *) &__reint2_766; \ - __ret_766 = __builtin_shufflevector(__ret_766, __ret_766, 3, 2, 1, 0); \ - __ret_766; \ +#define vset_lane_f16(__p0_858, __p1_858, __p2_858) __extension__ ({ \ + float16_t __s0_858 = __p0_858; \ + float16x4_t __s1_858 = __p1_858; \ + float16x4_t __rev1_858; __rev1_858 = __builtin_shufflevector(__s1_858, __s1_858, 3, 2, 1, 0); \ + float16x4_t __ret_858; \ +float16_t __reint_858 = __s0_858; \ +float16x4_t __reint1_858 = __rev1_858; \ +int16x4_t __reint2_858 = __noswap_vset_lane_s16(*(int16_t *) &__reint_858, *(int16x4_t *) &__reint1_858, __p2_858); \ + __ret_858 = *(float16x4_t *) &__reint2_858; \ + __ret_858 = __builtin_shufflevector(__ret_858, __ret_858, 3, 2, 1, 0); \ + __ret_858; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vsetq_lane_f16(__p0_767, __p1_767, __p2_767) __extension__ ({ \ - float16_t __s0_767 = __p0_767; \ - float16x8_t __s1_767 = __p1_767; \ - float16x8_t __ret_767; \ -float16_t __reint_767 = __s0_767; \ -float16x8_t __reint1_767 = __s1_767; \ -int16x8_t __reint2_767 = vsetq_lane_s16(*(int16_t *) &__reint_767, *(int16x8_t *) &__reint1_767, __p2_767); \ - __ret_767 = *(float16x8_t *) &__reint2_767; \ - __ret_767; \ +#define vsetq_lane_f16(__p0_859, __p1_859, __p2_859) __extension__ ({ \ + float16_t __s0_859 = __p0_859; \ + float16x8_t __s1_859 = __p1_859; \ + float16x8_t __ret_859; \ +float16_t __reint_859 = __s0_859; \ +float16x8_t __reint1_859 = __s1_859; \ +int16x8_t __reint2_859 = vsetq_lane_s16(*(int16_t *) &__reint_859, *(int16x8_t *) &__reint1_859, __p2_859); \ + __ret_859 = *(float16x8_t *) &__reint2_859; \ + __ret_859; \ }) #else -#define vsetq_lane_f16(__p0_768, __p1_768, __p2_768) __extension__ ({ \ - float16_t __s0_768 = __p0_768; \ - float16x8_t __s1_768 = __p1_768; \ 
- float16x8_t __rev1_768; __rev1_768 = __builtin_shufflevector(__s1_768, __s1_768, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x8_t __ret_768; \ -float16_t __reint_768 = __s0_768; \ -float16x8_t __reint1_768 = __rev1_768; \ -int16x8_t __reint2_768 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_768, *(int16x8_t *) &__reint1_768, __p2_768); \ - __ret_768 = *(float16x8_t *) &__reint2_768; \ - __ret_768 = __builtin_shufflevector(__ret_768, __ret_768, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_768; \ +#define vsetq_lane_f16(__p0_860, __p1_860, __p2_860) __extension__ ({ \ + float16_t __s0_860 = __p0_860; \ + float16x8_t __s1_860 = __p1_860; \ + float16x8_t __rev1_860; __rev1_860 = __builtin_shufflevector(__s1_860, __s1_860, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret_860; \ +float16_t __reint_860 = __s0_860; \ +float16x8_t __reint1_860 = __rev1_860; \ +int16x8_t __reint2_860 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_860, *(int16x8_t *) &__reint1_860, __p2_860); \ + __ret_860 = *(float16x8_t *) &__reint2_860; \ + __ret_860 = __builtin_shufflevector(__ret_860, __ret_860, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_860; \ }) #endif #if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) #ifdef __LITTLE_ENDIAN__ -#define vbfmlalbq_lane_f32(__p0_769, __p1_769, __p2_769, __p3_769) __extension__ ({ \ - float32x4_t __s0_769 = __p0_769; \ - bfloat16x8_t __s1_769 = __p1_769; \ - bfloat16x4_t __s2_769 = __p2_769; \ - float32x4_t __ret_769; \ - __ret_769 = vbfmlalbq_f32(__s0_769, __s1_769, (bfloat16x8_t) {vget_lane_bf16(__s2_769, __p3_769), vget_lane_bf16(__s2_769, __p3_769), vget_lane_bf16(__s2_769, __p3_769), vget_lane_bf16(__s2_769, __p3_769), vget_lane_bf16(__s2_769, __p3_769), vget_lane_bf16(__s2_769, __p3_769), vget_lane_bf16(__s2_769, __p3_769), vget_lane_bf16(__s2_769, __p3_769)}); \ - __ret_769; \ +#define vbfmlalbq_lane_f32(__p0_861, __p1_861, __p2_861, __p3_861) __extension__ ({ \ + float32x4_t __s0_861 = __p0_861; \ + bfloat16x8_t __s1_861 = __p1_861; \ + bfloat16x4_t __s2_861 = __p2_861; \ + float32x4_t __ret_861; \ + __ret_861 = vbfmlalbq_f32(__s0_861, __s1_861, (bfloat16x8_t) {vget_lane_bf16(__s2_861, __p3_861), vget_lane_bf16(__s2_861, __p3_861), vget_lane_bf16(__s2_861, __p3_861), vget_lane_bf16(__s2_861, __p3_861), vget_lane_bf16(__s2_861, __p3_861), vget_lane_bf16(__s2_861, __p3_861), vget_lane_bf16(__s2_861, __p3_861), vget_lane_bf16(__s2_861, __p3_861)}); \ + __ret_861; \ }) #else -#define vbfmlalbq_lane_f32(__p0_770, __p1_770, __p2_770, __p3_770) __extension__ ({ \ - float32x4_t __s0_770 = __p0_770; \ - bfloat16x8_t __s1_770 = __p1_770; \ - bfloat16x4_t __s2_770 = __p2_770; \ - float32x4_t __rev0_770; __rev0_770 = __builtin_shufflevector(__s0_770, __s0_770, 3, 2, 1, 0); \ - bfloat16x8_t __rev1_770; __rev1_770 = __builtin_shufflevector(__s1_770, __s1_770, 7, 6, 5, 4, 3, 2, 1, 0); \ - bfloat16x4_t __rev2_770; __rev2_770 = __builtin_shufflevector(__s2_770, __s2_770, 3, 2, 1, 0); \ - float32x4_t __ret_770; \ - __ret_770 = __noswap_vbfmlalbq_f32(__rev0_770, __rev1_770, (bfloat16x8_t) {__noswap_vget_lane_bf16(__rev2_770, __p3_770), __noswap_vget_lane_bf16(__rev2_770, __p3_770), __noswap_vget_lane_bf16(__rev2_770, __p3_770), __noswap_vget_lane_bf16(__rev2_770, __p3_770), __noswap_vget_lane_bf16(__rev2_770, __p3_770), __noswap_vget_lane_bf16(__rev2_770, __p3_770), __noswap_vget_lane_bf16(__rev2_770, __p3_770), __noswap_vget_lane_bf16(__rev2_770, __p3_770)}); \ - __ret_770 = __builtin_shufflevector(__ret_770, __ret_770, 3, 2, 1, 0); \ - __ret_770; \ +#define vbfmlalbq_lane_f32(__p0_862, __p1_862, __p2_862, 
__p3_862) __extension__ ({ \ + float32x4_t __s0_862 = __p0_862; \ + bfloat16x8_t __s1_862 = __p1_862; \ + bfloat16x4_t __s2_862 = __p2_862; \ + float32x4_t __rev0_862; __rev0_862 = __builtin_shufflevector(__s0_862, __s0_862, 3, 2, 1, 0); \ + bfloat16x8_t __rev1_862; __rev1_862 = __builtin_shufflevector(__s1_862, __s1_862, 7, 6, 5, 4, 3, 2, 1, 0); \ + bfloat16x4_t __rev2_862; __rev2_862 = __builtin_shufflevector(__s2_862, __s2_862, 3, 2, 1, 0); \ + float32x4_t __ret_862; \ + __ret_862 = __noswap_vbfmlalbq_f32(__rev0_862, __rev1_862, (bfloat16x8_t) {__noswap_vget_lane_bf16(__rev2_862, __p3_862), __noswap_vget_lane_bf16(__rev2_862, __p3_862), __noswap_vget_lane_bf16(__rev2_862, __p3_862), __noswap_vget_lane_bf16(__rev2_862, __p3_862), __noswap_vget_lane_bf16(__rev2_862, __p3_862), __noswap_vget_lane_bf16(__rev2_862, __p3_862), __noswap_vget_lane_bf16(__rev2_862, __p3_862), __noswap_vget_lane_bf16(__rev2_862, __p3_862)}); \ + __ret_862 = __builtin_shufflevector(__ret_862, __ret_862, 3, 2, 1, 0); \ + __ret_862; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vbfmlalbq_laneq_f32(__p0_771, __p1_771, __p2_771, __p3_771) __extension__ ({ \ - float32x4_t __s0_771 = __p0_771; \ - bfloat16x8_t __s1_771 = __p1_771; \ - bfloat16x8_t __s2_771 = __p2_771; \ - float32x4_t __ret_771; \ - __ret_771 = vbfmlalbq_f32(__s0_771, __s1_771, (bfloat16x8_t) {vgetq_lane_bf16(__s2_771, __p3_771), vgetq_lane_bf16(__s2_771, __p3_771), vgetq_lane_bf16(__s2_771, __p3_771), vgetq_lane_bf16(__s2_771, __p3_771), vgetq_lane_bf16(__s2_771, __p3_771), vgetq_lane_bf16(__s2_771, __p3_771), vgetq_lane_bf16(__s2_771, __p3_771), vgetq_lane_bf16(__s2_771, __p3_771)}); \ - __ret_771; \ +#define vbfmlalbq_laneq_f32(__p0_863, __p1_863, __p2_863, __p3_863) __extension__ ({ \ + float32x4_t __s0_863 = __p0_863; \ + bfloat16x8_t __s1_863 = __p1_863; \ + bfloat16x8_t __s2_863 = __p2_863; \ + float32x4_t __ret_863; \ + __ret_863 = vbfmlalbq_f32(__s0_863, __s1_863, (bfloat16x8_t) {vgetq_lane_bf16(__s2_863, __p3_863), vgetq_lane_bf16(__s2_863, __p3_863), vgetq_lane_bf16(__s2_863, __p3_863), vgetq_lane_bf16(__s2_863, __p3_863), vgetq_lane_bf16(__s2_863, __p3_863), vgetq_lane_bf16(__s2_863, __p3_863), vgetq_lane_bf16(__s2_863, __p3_863), vgetq_lane_bf16(__s2_863, __p3_863)}); \ + __ret_863; \ }) #else -#define vbfmlalbq_laneq_f32(__p0_772, __p1_772, __p2_772, __p3_772) __extension__ ({ \ - float32x4_t __s0_772 = __p0_772; \ - bfloat16x8_t __s1_772 = __p1_772; \ - bfloat16x8_t __s2_772 = __p2_772; \ - float32x4_t __rev0_772; __rev0_772 = __builtin_shufflevector(__s0_772, __s0_772, 3, 2, 1, 0); \ - bfloat16x8_t __rev1_772; __rev1_772 = __builtin_shufflevector(__s1_772, __s1_772, 7, 6, 5, 4, 3, 2, 1, 0); \ - bfloat16x8_t __rev2_772; __rev2_772 = __builtin_shufflevector(__s2_772, __s2_772, 7, 6, 5, 4, 3, 2, 1, 0); \ - float32x4_t __ret_772; \ - __ret_772 = __noswap_vbfmlalbq_f32(__rev0_772, __rev1_772, (bfloat16x8_t) {__noswap_vgetq_lane_bf16(__rev2_772, __p3_772), __noswap_vgetq_lane_bf16(__rev2_772, __p3_772), __noswap_vgetq_lane_bf16(__rev2_772, __p3_772), __noswap_vgetq_lane_bf16(__rev2_772, __p3_772), __noswap_vgetq_lane_bf16(__rev2_772, __p3_772), __noswap_vgetq_lane_bf16(__rev2_772, __p3_772), __noswap_vgetq_lane_bf16(__rev2_772, __p3_772), __noswap_vgetq_lane_bf16(__rev2_772, __p3_772)}); \ - __ret_772 = __builtin_shufflevector(__ret_772, __ret_772, 3, 2, 1, 0); \ - __ret_772; \ +#define vbfmlalbq_laneq_f32(__p0_864, __p1_864, __p2_864, __p3_864) __extension__ ({ \ + float32x4_t __s0_864 = __p0_864; \ + bfloat16x8_t __s1_864 = __p1_864; \ + 
bfloat16x8_t __s2_864 = __p2_864; \ + float32x4_t __rev0_864; __rev0_864 = __builtin_shufflevector(__s0_864, __s0_864, 3, 2, 1, 0); \ + bfloat16x8_t __rev1_864; __rev1_864 = __builtin_shufflevector(__s1_864, __s1_864, 7, 6, 5, 4, 3, 2, 1, 0); \ + bfloat16x8_t __rev2_864; __rev2_864 = __builtin_shufflevector(__s2_864, __s2_864, 7, 6, 5, 4, 3, 2, 1, 0); \ + float32x4_t __ret_864; \ + __ret_864 = __noswap_vbfmlalbq_f32(__rev0_864, __rev1_864, (bfloat16x8_t) {__noswap_vgetq_lane_bf16(__rev2_864, __p3_864), __noswap_vgetq_lane_bf16(__rev2_864, __p3_864), __noswap_vgetq_lane_bf16(__rev2_864, __p3_864), __noswap_vgetq_lane_bf16(__rev2_864, __p3_864), __noswap_vgetq_lane_bf16(__rev2_864, __p3_864), __noswap_vgetq_lane_bf16(__rev2_864, __p3_864), __noswap_vgetq_lane_bf16(__rev2_864, __p3_864), __noswap_vgetq_lane_bf16(__rev2_864, __p3_864)}); \ + __ret_864 = __builtin_shufflevector(__ret_864, __ret_864, 3, 2, 1, 0); \ + __ret_864; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vbfmlaltq_lane_f32(__p0_773, __p1_773, __p2_773, __p3_773) __extension__ ({ \ - float32x4_t __s0_773 = __p0_773; \ - bfloat16x8_t __s1_773 = __p1_773; \ - bfloat16x4_t __s2_773 = __p2_773; \ - float32x4_t __ret_773; \ - __ret_773 = vbfmlaltq_f32(__s0_773, __s1_773, (bfloat16x8_t) {vget_lane_bf16(__s2_773, __p3_773), vget_lane_bf16(__s2_773, __p3_773), vget_lane_bf16(__s2_773, __p3_773), vget_lane_bf16(__s2_773, __p3_773), vget_lane_bf16(__s2_773, __p3_773), vget_lane_bf16(__s2_773, __p3_773), vget_lane_bf16(__s2_773, __p3_773), vget_lane_bf16(__s2_773, __p3_773)}); \ - __ret_773; \ +#define vbfmlaltq_lane_f32(__p0_865, __p1_865, __p2_865, __p3_865) __extension__ ({ \ + float32x4_t __s0_865 = __p0_865; \ + bfloat16x8_t __s1_865 = __p1_865; \ + bfloat16x4_t __s2_865 = __p2_865; \ + float32x4_t __ret_865; \ + __ret_865 = vbfmlaltq_f32(__s0_865, __s1_865, (bfloat16x8_t) {vget_lane_bf16(__s2_865, __p3_865), vget_lane_bf16(__s2_865, __p3_865), vget_lane_bf16(__s2_865, __p3_865), vget_lane_bf16(__s2_865, __p3_865), vget_lane_bf16(__s2_865, __p3_865), vget_lane_bf16(__s2_865, __p3_865), vget_lane_bf16(__s2_865, __p3_865), vget_lane_bf16(__s2_865, __p3_865)}); \ + __ret_865; \ }) #else -#define vbfmlaltq_lane_f32(__p0_774, __p1_774, __p2_774, __p3_774) __extension__ ({ \ - float32x4_t __s0_774 = __p0_774; \ - bfloat16x8_t __s1_774 = __p1_774; \ - bfloat16x4_t __s2_774 = __p2_774; \ - float32x4_t __rev0_774; __rev0_774 = __builtin_shufflevector(__s0_774, __s0_774, 3, 2, 1, 0); \ - bfloat16x8_t __rev1_774; __rev1_774 = __builtin_shufflevector(__s1_774, __s1_774, 7, 6, 5, 4, 3, 2, 1, 0); \ - bfloat16x4_t __rev2_774; __rev2_774 = __builtin_shufflevector(__s2_774, __s2_774, 3, 2, 1, 0); \ - float32x4_t __ret_774; \ - __ret_774 = __noswap_vbfmlaltq_f32(__rev0_774, __rev1_774, (bfloat16x8_t) {__noswap_vget_lane_bf16(__rev2_774, __p3_774), __noswap_vget_lane_bf16(__rev2_774, __p3_774), __noswap_vget_lane_bf16(__rev2_774, __p3_774), __noswap_vget_lane_bf16(__rev2_774, __p3_774), __noswap_vget_lane_bf16(__rev2_774, __p3_774), __noswap_vget_lane_bf16(__rev2_774, __p3_774), __noswap_vget_lane_bf16(__rev2_774, __p3_774), __noswap_vget_lane_bf16(__rev2_774, __p3_774)}); \ - __ret_774 = __builtin_shufflevector(__ret_774, __ret_774, 3, 2, 1, 0); \ - __ret_774; \ +#define vbfmlaltq_lane_f32(__p0_866, __p1_866, __p2_866, __p3_866) __extension__ ({ \ + float32x4_t __s0_866 = __p0_866; \ + bfloat16x8_t __s1_866 = __p1_866; \ + bfloat16x4_t __s2_866 = __p2_866; \ + float32x4_t __rev0_866; __rev0_866 = __builtin_shufflevector(__s0_866, __s0_866, 3, 2, 1, 
0); \ + bfloat16x8_t __rev1_866; __rev1_866 = __builtin_shufflevector(__s1_866, __s1_866, 7, 6, 5, 4, 3, 2, 1, 0); \ + bfloat16x4_t __rev2_866; __rev2_866 = __builtin_shufflevector(__s2_866, __s2_866, 3, 2, 1, 0); \ + float32x4_t __ret_866; \ + __ret_866 = __noswap_vbfmlaltq_f32(__rev0_866, __rev1_866, (bfloat16x8_t) {__noswap_vget_lane_bf16(__rev2_866, __p3_866), __noswap_vget_lane_bf16(__rev2_866, __p3_866), __noswap_vget_lane_bf16(__rev2_866, __p3_866), __noswap_vget_lane_bf16(__rev2_866, __p3_866), __noswap_vget_lane_bf16(__rev2_866, __p3_866), __noswap_vget_lane_bf16(__rev2_866, __p3_866), __noswap_vget_lane_bf16(__rev2_866, __p3_866), __noswap_vget_lane_bf16(__rev2_866, __p3_866)}); \ + __ret_866 = __builtin_shufflevector(__ret_866, __ret_866, 3, 2, 1, 0); \ + __ret_866; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vbfmlaltq_laneq_f32(__p0_775, __p1_775, __p2_775, __p3_775) __extension__ ({ \ - float32x4_t __s0_775 = __p0_775; \ - bfloat16x8_t __s1_775 = __p1_775; \ - bfloat16x8_t __s2_775 = __p2_775; \ - float32x4_t __ret_775; \ - __ret_775 = vbfmlaltq_f32(__s0_775, __s1_775, (bfloat16x8_t) {vgetq_lane_bf16(__s2_775, __p3_775), vgetq_lane_bf16(__s2_775, __p3_775), vgetq_lane_bf16(__s2_775, __p3_775), vgetq_lane_bf16(__s2_775, __p3_775), vgetq_lane_bf16(__s2_775, __p3_775), vgetq_lane_bf16(__s2_775, __p3_775), vgetq_lane_bf16(__s2_775, __p3_775), vgetq_lane_bf16(__s2_775, __p3_775)}); \ - __ret_775; \ +#define vbfmlaltq_laneq_f32(__p0_867, __p1_867, __p2_867, __p3_867) __extension__ ({ \ + float32x4_t __s0_867 = __p0_867; \ + bfloat16x8_t __s1_867 = __p1_867; \ + bfloat16x8_t __s2_867 = __p2_867; \ + float32x4_t __ret_867; \ + __ret_867 = vbfmlaltq_f32(__s0_867, __s1_867, (bfloat16x8_t) {vgetq_lane_bf16(__s2_867, __p3_867), vgetq_lane_bf16(__s2_867, __p3_867), vgetq_lane_bf16(__s2_867, __p3_867), vgetq_lane_bf16(__s2_867, __p3_867), vgetq_lane_bf16(__s2_867, __p3_867), vgetq_lane_bf16(__s2_867, __p3_867), vgetq_lane_bf16(__s2_867, __p3_867), vgetq_lane_bf16(__s2_867, __p3_867)}); \ + __ret_867; \ }) #else -#define vbfmlaltq_laneq_f32(__p0_776, __p1_776, __p2_776, __p3_776) __extension__ ({ \ - float32x4_t __s0_776 = __p0_776; \ - bfloat16x8_t __s1_776 = __p1_776; \ - bfloat16x8_t __s2_776 = __p2_776; \ - float32x4_t __rev0_776; __rev0_776 = __builtin_shufflevector(__s0_776, __s0_776, 3, 2, 1, 0); \ - bfloat16x8_t __rev1_776; __rev1_776 = __builtin_shufflevector(__s1_776, __s1_776, 7, 6, 5, 4, 3, 2, 1, 0); \ - bfloat16x8_t __rev2_776; __rev2_776 = __builtin_shufflevector(__s2_776, __s2_776, 7, 6, 5, 4, 3, 2, 1, 0); \ - float32x4_t __ret_776; \ - __ret_776 = __noswap_vbfmlaltq_f32(__rev0_776, __rev1_776, (bfloat16x8_t) {__noswap_vgetq_lane_bf16(__rev2_776, __p3_776), __noswap_vgetq_lane_bf16(__rev2_776, __p3_776), __noswap_vgetq_lane_bf16(__rev2_776, __p3_776), __noswap_vgetq_lane_bf16(__rev2_776, __p3_776), __noswap_vgetq_lane_bf16(__rev2_776, __p3_776), __noswap_vgetq_lane_bf16(__rev2_776, __p3_776), __noswap_vgetq_lane_bf16(__rev2_776, __p3_776), __noswap_vgetq_lane_bf16(__rev2_776, __p3_776)}); \ - __ret_776 = __builtin_shufflevector(__ret_776, __ret_776, 3, 2, 1, 0); \ - __ret_776; \ +#define vbfmlaltq_laneq_f32(__p0_868, __p1_868, __p2_868, __p3_868) __extension__ ({ \ + float32x4_t __s0_868 = __p0_868; \ + bfloat16x8_t __s1_868 = __p1_868; \ + bfloat16x8_t __s2_868 = __p2_868; \ + float32x4_t __rev0_868; __rev0_868 = __builtin_shufflevector(__s0_868, __s0_868, 3, 2, 1, 0); \ + bfloat16x8_t __rev1_868; __rev1_868 = __builtin_shufflevector(__s1_868, __s1_868, 7, 6, 5, 4, 3, 2, 1, 
0); \ + bfloat16x8_t __rev2_868; __rev2_868 = __builtin_shufflevector(__s2_868, __s2_868, 7, 6, 5, 4, 3, 2, 1, 0); \ + float32x4_t __ret_868; \ + __ret_868 = __noswap_vbfmlaltq_f32(__rev0_868, __rev1_868, (bfloat16x8_t) {__noswap_vgetq_lane_bf16(__rev2_868, __p3_868), __noswap_vgetq_lane_bf16(__rev2_868, __p3_868), __noswap_vgetq_lane_bf16(__rev2_868, __p3_868), __noswap_vgetq_lane_bf16(__rev2_868, __p3_868), __noswap_vgetq_lane_bf16(__rev2_868, __p3_868), __noswap_vgetq_lane_bf16(__rev2_868, __p3_868), __noswap_vgetq_lane_bf16(__rev2_868, __p3_868), __noswap_vgetq_lane_bf16(__rev2_868, __p3_868)}); \ + __ret_868 = __builtin_shufflevector(__ret_868, __ret_868, 3, 2, 1, 0); \ + __ret_868; \ }) #endif @@ -65559,480 +67229,480 @@ __ai float32x4_t vcvtq_low_f32_bf16(bfloat16x8_t __p0) { #endif #if defined(__ARM_FEATURE_FP16FML) && defined(__aarch64__) #ifdef __LITTLE_ENDIAN__ -#define vfmlalq_lane_high_f16(__p0_777, __p1_777, __p2_777, __p3_777) __extension__ ({ \ - float32x4_t __s0_777 = __p0_777; \ - float16x8_t __s1_777 = __p1_777; \ - float16x4_t __s2_777 = __p2_777; \ - float32x4_t __ret_777; \ - __ret_777 = vfmlalq_high_f16(__s0_777, __s1_777, (float16x8_t) {vget_lane_f16(__s2_777, __p3_777), vget_lane_f16(__s2_777, __p3_777), vget_lane_f16(__s2_777, __p3_777), vget_lane_f16(__s2_777, __p3_777), vget_lane_f16(__s2_777, __p3_777), vget_lane_f16(__s2_777, __p3_777), vget_lane_f16(__s2_777, __p3_777), vget_lane_f16(__s2_777, __p3_777)}); \ - __ret_777; \ +#define vfmlalq_lane_high_f16(__p0_869, __p1_869, __p2_869, __p3_869) __extension__ ({ \ + float32x4_t __s0_869 = __p0_869; \ + float16x8_t __s1_869 = __p1_869; \ + float16x4_t __s2_869 = __p2_869; \ + float32x4_t __ret_869; \ + __ret_869 = vfmlalq_high_f16(__s0_869, __s1_869, (float16x8_t) {vget_lane_f16(__s2_869, __p3_869), vget_lane_f16(__s2_869, __p3_869), vget_lane_f16(__s2_869, __p3_869), vget_lane_f16(__s2_869, __p3_869), vget_lane_f16(__s2_869, __p3_869), vget_lane_f16(__s2_869, __p3_869), vget_lane_f16(__s2_869, __p3_869), vget_lane_f16(__s2_869, __p3_869)}); \ + __ret_869; \ }) #else -#define vfmlalq_lane_high_f16(__p0_778, __p1_778, __p2_778, __p3_778) __extension__ ({ \ - float32x4_t __s0_778 = __p0_778; \ - float16x8_t __s1_778 = __p1_778; \ - float16x4_t __s2_778 = __p2_778; \ - float32x4_t __rev0_778; __rev0_778 = __builtin_shufflevector(__s0_778, __s0_778, 3, 2, 1, 0); \ - float16x8_t __rev1_778; __rev1_778 = __builtin_shufflevector(__s1_778, __s1_778, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x4_t __rev2_778; __rev2_778 = __builtin_shufflevector(__s2_778, __s2_778, 3, 2, 1, 0); \ - float32x4_t __ret_778; \ - __ret_778 = __noswap_vfmlalq_high_f16(__rev0_778, __rev1_778, (float16x8_t) {__noswap_vget_lane_f16(__rev2_778, __p3_778), __noswap_vget_lane_f16(__rev2_778, __p3_778), __noswap_vget_lane_f16(__rev2_778, __p3_778), __noswap_vget_lane_f16(__rev2_778, __p3_778), __noswap_vget_lane_f16(__rev2_778, __p3_778), __noswap_vget_lane_f16(__rev2_778, __p3_778), __noswap_vget_lane_f16(__rev2_778, __p3_778), __noswap_vget_lane_f16(__rev2_778, __p3_778)}); \ - __ret_778 = __builtin_shufflevector(__ret_778, __ret_778, 3, 2, 1, 0); \ - __ret_778; \ +#define vfmlalq_lane_high_f16(__p0_870, __p1_870, __p2_870, __p3_870) __extension__ ({ \ + float32x4_t __s0_870 = __p0_870; \ + float16x8_t __s1_870 = __p1_870; \ + float16x4_t __s2_870 = __p2_870; \ + float32x4_t __rev0_870; __rev0_870 = __builtin_shufflevector(__s0_870, __s0_870, 3, 2, 1, 0); \ + float16x8_t __rev1_870; __rev1_870 = __builtin_shufflevector(__s1_870, __s1_870, 7, 6, 5, 4, 3, 2, 
1, 0); \ + float16x4_t __rev2_870; __rev2_870 = __builtin_shufflevector(__s2_870, __s2_870, 3, 2, 1, 0); \ + float32x4_t __ret_870; \ + __ret_870 = __noswap_vfmlalq_high_f16(__rev0_870, __rev1_870, (float16x8_t) {__noswap_vget_lane_f16(__rev2_870, __p3_870), __noswap_vget_lane_f16(__rev2_870, __p3_870), __noswap_vget_lane_f16(__rev2_870, __p3_870), __noswap_vget_lane_f16(__rev2_870, __p3_870), __noswap_vget_lane_f16(__rev2_870, __p3_870), __noswap_vget_lane_f16(__rev2_870, __p3_870), __noswap_vget_lane_f16(__rev2_870, __p3_870), __noswap_vget_lane_f16(__rev2_870, __p3_870)}); \ + __ret_870 = __builtin_shufflevector(__ret_870, __ret_870, 3, 2, 1, 0); \ + __ret_870; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlal_lane_high_f16(__p0_779, __p1_779, __p2_779, __p3_779) __extension__ ({ \ - float32x2_t __s0_779 = __p0_779; \ - float16x4_t __s1_779 = __p1_779; \ - float16x4_t __s2_779 = __p2_779; \ - float32x2_t __ret_779; \ - __ret_779 = vfmlal_high_f16(__s0_779, __s1_779, (float16x4_t) {vget_lane_f16(__s2_779, __p3_779), vget_lane_f16(__s2_779, __p3_779), vget_lane_f16(__s2_779, __p3_779), vget_lane_f16(__s2_779, __p3_779)}); \ - __ret_779; \ +#define vfmlal_lane_high_f16(__p0_871, __p1_871, __p2_871, __p3_871) __extension__ ({ \ + float32x2_t __s0_871 = __p0_871; \ + float16x4_t __s1_871 = __p1_871; \ + float16x4_t __s2_871 = __p2_871; \ + float32x2_t __ret_871; \ + __ret_871 = vfmlal_high_f16(__s0_871, __s1_871, (float16x4_t) {vget_lane_f16(__s2_871, __p3_871), vget_lane_f16(__s2_871, __p3_871), vget_lane_f16(__s2_871, __p3_871), vget_lane_f16(__s2_871, __p3_871)}); \ + __ret_871; \ }) #else -#define vfmlal_lane_high_f16(__p0_780, __p1_780, __p2_780, __p3_780) __extension__ ({ \ - float32x2_t __s0_780 = __p0_780; \ - float16x4_t __s1_780 = __p1_780; \ - float16x4_t __s2_780 = __p2_780; \ - float32x2_t __rev0_780; __rev0_780 = __builtin_shufflevector(__s0_780, __s0_780, 1, 0); \ - float16x4_t __rev1_780; __rev1_780 = __builtin_shufflevector(__s1_780, __s1_780, 3, 2, 1, 0); \ - float16x4_t __rev2_780; __rev2_780 = __builtin_shufflevector(__s2_780, __s2_780, 3, 2, 1, 0); \ - float32x2_t __ret_780; \ - __ret_780 = __noswap_vfmlal_high_f16(__rev0_780, __rev1_780, (float16x4_t) {__noswap_vget_lane_f16(__rev2_780, __p3_780), __noswap_vget_lane_f16(__rev2_780, __p3_780), __noswap_vget_lane_f16(__rev2_780, __p3_780), __noswap_vget_lane_f16(__rev2_780, __p3_780)}); \ - __ret_780 = __builtin_shufflevector(__ret_780, __ret_780, 1, 0); \ - __ret_780; \ +#define vfmlal_lane_high_f16(__p0_872, __p1_872, __p2_872, __p3_872) __extension__ ({ \ + float32x2_t __s0_872 = __p0_872; \ + float16x4_t __s1_872 = __p1_872; \ + float16x4_t __s2_872 = __p2_872; \ + float32x2_t __rev0_872; __rev0_872 = __builtin_shufflevector(__s0_872, __s0_872, 1, 0); \ + float16x4_t __rev1_872; __rev1_872 = __builtin_shufflevector(__s1_872, __s1_872, 3, 2, 1, 0); \ + float16x4_t __rev2_872; __rev2_872 = __builtin_shufflevector(__s2_872, __s2_872, 3, 2, 1, 0); \ + float32x2_t __ret_872; \ + __ret_872 = __noswap_vfmlal_high_f16(__rev0_872, __rev1_872, (float16x4_t) {__noswap_vget_lane_f16(__rev2_872, __p3_872), __noswap_vget_lane_f16(__rev2_872, __p3_872), __noswap_vget_lane_f16(__rev2_872, __p3_872), __noswap_vget_lane_f16(__rev2_872, __p3_872)}); \ + __ret_872 = __builtin_shufflevector(__ret_872, __ret_872, 1, 0); \ + __ret_872; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlalq_lane_low_f16(__p0_781, __p1_781, __p2_781, __p3_781) __extension__ ({ \ - float32x4_t __s0_781 = __p0_781; \ - float16x8_t __s1_781 = __p1_781; \ - 
float16x4_t __s2_781 = __p2_781; \ - float32x4_t __ret_781; \ - __ret_781 = vfmlalq_low_f16(__s0_781, __s1_781, (float16x8_t) {vget_lane_f16(__s2_781, __p3_781), vget_lane_f16(__s2_781, __p3_781), vget_lane_f16(__s2_781, __p3_781), vget_lane_f16(__s2_781, __p3_781), vget_lane_f16(__s2_781, __p3_781), vget_lane_f16(__s2_781, __p3_781), vget_lane_f16(__s2_781, __p3_781), vget_lane_f16(__s2_781, __p3_781)}); \ - __ret_781; \ +#define vfmlalq_lane_low_f16(__p0_873, __p1_873, __p2_873, __p3_873) __extension__ ({ \ + float32x4_t __s0_873 = __p0_873; \ + float16x8_t __s1_873 = __p1_873; \ + float16x4_t __s2_873 = __p2_873; \ + float32x4_t __ret_873; \ + __ret_873 = vfmlalq_low_f16(__s0_873, __s1_873, (float16x8_t) {vget_lane_f16(__s2_873, __p3_873), vget_lane_f16(__s2_873, __p3_873), vget_lane_f16(__s2_873, __p3_873), vget_lane_f16(__s2_873, __p3_873), vget_lane_f16(__s2_873, __p3_873), vget_lane_f16(__s2_873, __p3_873), vget_lane_f16(__s2_873, __p3_873), vget_lane_f16(__s2_873, __p3_873)}); \ + __ret_873; \ }) #else -#define vfmlalq_lane_low_f16(__p0_782, __p1_782, __p2_782, __p3_782) __extension__ ({ \ - float32x4_t __s0_782 = __p0_782; \ - float16x8_t __s1_782 = __p1_782; \ - float16x4_t __s2_782 = __p2_782; \ - float32x4_t __rev0_782; __rev0_782 = __builtin_shufflevector(__s0_782, __s0_782, 3, 2, 1, 0); \ - float16x8_t __rev1_782; __rev1_782 = __builtin_shufflevector(__s1_782, __s1_782, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x4_t __rev2_782; __rev2_782 = __builtin_shufflevector(__s2_782, __s2_782, 3, 2, 1, 0); \ - float32x4_t __ret_782; \ - __ret_782 = __noswap_vfmlalq_low_f16(__rev0_782, __rev1_782, (float16x8_t) {__noswap_vget_lane_f16(__rev2_782, __p3_782), __noswap_vget_lane_f16(__rev2_782, __p3_782), __noswap_vget_lane_f16(__rev2_782, __p3_782), __noswap_vget_lane_f16(__rev2_782, __p3_782), __noswap_vget_lane_f16(__rev2_782, __p3_782), __noswap_vget_lane_f16(__rev2_782, __p3_782), __noswap_vget_lane_f16(__rev2_782, __p3_782), __noswap_vget_lane_f16(__rev2_782, __p3_782)}); \ - __ret_782 = __builtin_shufflevector(__ret_782, __ret_782, 3, 2, 1, 0); \ - __ret_782; \ +#define vfmlalq_lane_low_f16(__p0_874, __p1_874, __p2_874, __p3_874) __extension__ ({ \ + float32x4_t __s0_874 = __p0_874; \ + float16x8_t __s1_874 = __p1_874; \ + float16x4_t __s2_874 = __p2_874; \ + float32x4_t __rev0_874; __rev0_874 = __builtin_shufflevector(__s0_874, __s0_874, 3, 2, 1, 0); \ + float16x8_t __rev1_874; __rev1_874 = __builtin_shufflevector(__s1_874, __s1_874, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __rev2_874; __rev2_874 = __builtin_shufflevector(__s2_874, __s2_874, 3, 2, 1, 0); \ + float32x4_t __ret_874; \ + __ret_874 = __noswap_vfmlalq_low_f16(__rev0_874, __rev1_874, (float16x8_t) {__noswap_vget_lane_f16(__rev2_874, __p3_874), __noswap_vget_lane_f16(__rev2_874, __p3_874), __noswap_vget_lane_f16(__rev2_874, __p3_874), __noswap_vget_lane_f16(__rev2_874, __p3_874), __noswap_vget_lane_f16(__rev2_874, __p3_874), __noswap_vget_lane_f16(__rev2_874, __p3_874), __noswap_vget_lane_f16(__rev2_874, __p3_874), __noswap_vget_lane_f16(__rev2_874, __p3_874)}); \ + __ret_874 = __builtin_shufflevector(__ret_874, __ret_874, 3, 2, 1, 0); \ + __ret_874; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlal_lane_low_f16(__p0_783, __p1_783, __p2_783, __p3_783) __extension__ ({ \ - float32x2_t __s0_783 = __p0_783; \ - float16x4_t __s1_783 = __p1_783; \ - float16x4_t __s2_783 = __p2_783; \ - float32x2_t __ret_783; \ - __ret_783 = vfmlal_low_f16(__s0_783, __s1_783, (float16x4_t) {vget_lane_f16(__s2_783, __p3_783), 
vget_lane_f16(__s2_783, __p3_783), vget_lane_f16(__s2_783, __p3_783), vget_lane_f16(__s2_783, __p3_783)}); \ - __ret_783; \ +#define vfmlal_lane_low_f16(__p0_875, __p1_875, __p2_875, __p3_875) __extension__ ({ \ + float32x2_t __s0_875 = __p0_875; \ + float16x4_t __s1_875 = __p1_875; \ + float16x4_t __s2_875 = __p2_875; \ + float32x2_t __ret_875; \ + __ret_875 = vfmlal_low_f16(__s0_875, __s1_875, (float16x4_t) {vget_lane_f16(__s2_875, __p3_875), vget_lane_f16(__s2_875, __p3_875), vget_lane_f16(__s2_875, __p3_875), vget_lane_f16(__s2_875, __p3_875)}); \ + __ret_875; \ }) #else -#define vfmlal_lane_low_f16(__p0_784, __p1_784, __p2_784, __p3_784) __extension__ ({ \ - float32x2_t __s0_784 = __p0_784; \ - float16x4_t __s1_784 = __p1_784; \ - float16x4_t __s2_784 = __p2_784; \ - float32x2_t __rev0_784; __rev0_784 = __builtin_shufflevector(__s0_784, __s0_784, 1, 0); \ - float16x4_t __rev1_784; __rev1_784 = __builtin_shufflevector(__s1_784, __s1_784, 3, 2, 1, 0); \ - float16x4_t __rev2_784; __rev2_784 = __builtin_shufflevector(__s2_784, __s2_784, 3, 2, 1, 0); \ - float32x2_t __ret_784; \ - __ret_784 = __noswap_vfmlal_low_f16(__rev0_784, __rev1_784, (float16x4_t) {__noswap_vget_lane_f16(__rev2_784, __p3_784), __noswap_vget_lane_f16(__rev2_784, __p3_784), __noswap_vget_lane_f16(__rev2_784, __p3_784), __noswap_vget_lane_f16(__rev2_784, __p3_784)}); \ - __ret_784 = __builtin_shufflevector(__ret_784, __ret_784, 1, 0); \ - __ret_784; \ +#define vfmlal_lane_low_f16(__p0_876, __p1_876, __p2_876, __p3_876) __extension__ ({ \ + float32x2_t __s0_876 = __p0_876; \ + float16x4_t __s1_876 = __p1_876; \ + float16x4_t __s2_876 = __p2_876; \ + float32x2_t __rev0_876; __rev0_876 = __builtin_shufflevector(__s0_876, __s0_876, 1, 0); \ + float16x4_t __rev1_876; __rev1_876 = __builtin_shufflevector(__s1_876, __s1_876, 3, 2, 1, 0); \ + float16x4_t __rev2_876; __rev2_876 = __builtin_shufflevector(__s2_876, __s2_876, 3, 2, 1, 0); \ + float32x2_t __ret_876; \ + __ret_876 = __noswap_vfmlal_low_f16(__rev0_876, __rev1_876, (float16x4_t) {__noswap_vget_lane_f16(__rev2_876, __p3_876), __noswap_vget_lane_f16(__rev2_876, __p3_876), __noswap_vget_lane_f16(__rev2_876, __p3_876), __noswap_vget_lane_f16(__rev2_876, __p3_876)}); \ + __ret_876 = __builtin_shufflevector(__ret_876, __ret_876, 1, 0); \ + __ret_876; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlalq_laneq_high_f16(__p0_785, __p1_785, __p2_785, __p3_785) __extension__ ({ \ - float32x4_t __s0_785 = __p0_785; \ - float16x8_t __s1_785 = __p1_785; \ - float16x8_t __s2_785 = __p2_785; \ - float32x4_t __ret_785; \ - __ret_785 = vfmlalq_high_f16(__s0_785, __s1_785, (float16x8_t) {vgetq_lane_f16(__s2_785, __p3_785), vgetq_lane_f16(__s2_785, __p3_785), vgetq_lane_f16(__s2_785, __p3_785), vgetq_lane_f16(__s2_785, __p3_785), vgetq_lane_f16(__s2_785, __p3_785), vgetq_lane_f16(__s2_785, __p3_785), vgetq_lane_f16(__s2_785, __p3_785), vgetq_lane_f16(__s2_785, __p3_785)}); \ - __ret_785; \ +#define vfmlalq_laneq_high_f16(__p0_877, __p1_877, __p2_877, __p3_877) __extension__ ({ \ + float32x4_t __s0_877 = __p0_877; \ + float16x8_t __s1_877 = __p1_877; \ + float16x8_t __s2_877 = __p2_877; \ + float32x4_t __ret_877; \ + __ret_877 = vfmlalq_high_f16(__s0_877, __s1_877, (float16x8_t) {vgetq_lane_f16(__s2_877, __p3_877), vgetq_lane_f16(__s2_877, __p3_877), vgetq_lane_f16(__s2_877, __p3_877), vgetq_lane_f16(__s2_877, __p3_877), vgetq_lane_f16(__s2_877, __p3_877), vgetq_lane_f16(__s2_877, __p3_877), vgetq_lane_f16(__s2_877, __p3_877), vgetq_lane_f16(__s2_877, __p3_877)}); \ + __ret_877; \ }) 
#else -#define vfmlalq_laneq_high_f16(__p0_786, __p1_786, __p2_786, __p3_786) __extension__ ({ \ - float32x4_t __s0_786 = __p0_786; \ - float16x8_t __s1_786 = __p1_786; \ - float16x8_t __s2_786 = __p2_786; \ - float32x4_t __rev0_786; __rev0_786 = __builtin_shufflevector(__s0_786, __s0_786, 3, 2, 1, 0); \ - float16x8_t __rev1_786; __rev1_786 = __builtin_shufflevector(__s1_786, __s1_786, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x8_t __rev2_786; __rev2_786 = __builtin_shufflevector(__s2_786, __s2_786, 7, 6, 5, 4, 3, 2, 1, 0); \ - float32x4_t __ret_786; \ - __ret_786 = __noswap_vfmlalq_high_f16(__rev0_786, __rev1_786, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_786, __p3_786), __noswap_vgetq_lane_f16(__rev2_786, __p3_786), __noswap_vgetq_lane_f16(__rev2_786, __p3_786), __noswap_vgetq_lane_f16(__rev2_786, __p3_786), __noswap_vgetq_lane_f16(__rev2_786, __p3_786), __noswap_vgetq_lane_f16(__rev2_786, __p3_786), __noswap_vgetq_lane_f16(__rev2_786, __p3_786), __noswap_vgetq_lane_f16(__rev2_786, __p3_786)}); \ - __ret_786 = __builtin_shufflevector(__ret_786, __ret_786, 3, 2, 1, 0); \ - __ret_786; \ +#define vfmlalq_laneq_high_f16(__p0_878, __p1_878, __p2_878, __p3_878) __extension__ ({ \ + float32x4_t __s0_878 = __p0_878; \ + float16x8_t __s1_878 = __p1_878; \ + float16x8_t __s2_878 = __p2_878; \ + float32x4_t __rev0_878; __rev0_878 = __builtin_shufflevector(__s0_878, __s0_878, 3, 2, 1, 0); \ + float16x8_t __rev1_878; __rev1_878 = __builtin_shufflevector(__s1_878, __s1_878, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev2_878; __rev2_878 = __builtin_shufflevector(__s2_878, __s2_878, 7, 6, 5, 4, 3, 2, 1, 0); \ + float32x4_t __ret_878; \ + __ret_878 = __noswap_vfmlalq_high_f16(__rev0_878, __rev1_878, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_878, __p3_878), __noswap_vgetq_lane_f16(__rev2_878, __p3_878), __noswap_vgetq_lane_f16(__rev2_878, __p3_878), __noswap_vgetq_lane_f16(__rev2_878, __p3_878), __noswap_vgetq_lane_f16(__rev2_878, __p3_878), __noswap_vgetq_lane_f16(__rev2_878, __p3_878), __noswap_vgetq_lane_f16(__rev2_878, __p3_878), __noswap_vgetq_lane_f16(__rev2_878, __p3_878)}); \ + __ret_878 = __builtin_shufflevector(__ret_878, __ret_878, 3, 2, 1, 0); \ + __ret_878; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlal_laneq_high_f16(__p0_787, __p1_787, __p2_787, __p3_787) __extension__ ({ \ - float32x2_t __s0_787 = __p0_787; \ - float16x4_t __s1_787 = __p1_787; \ - float16x8_t __s2_787 = __p2_787; \ - float32x2_t __ret_787; \ - __ret_787 = vfmlal_high_f16(__s0_787, __s1_787, (float16x4_t) {vgetq_lane_f16(__s2_787, __p3_787), vgetq_lane_f16(__s2_787, __p3_787), vgetq_lane_f16(__s2_787, __p3_787), vgetq_lane_f16(__s2_787, __p3_787)}); \ - __ret_787; \ +#define vfmlal_laneq_high_f16(__p0_879, __p1_879, __p2_879, __p3_879) __extension__ ({ \ + float32x2_t __s0_879 = __p0_879; \ + float16x4_t __s1_879 = __p1_879; \ + float16x8_t __s2_879 = __p2_879; \ + float32x2_t __ret_879; \ + __ret_879 = vfmlal_high_f16(__s0_879, __s1_879, (float16x4_t) {vgetq_lane_f16(__s2_879, __p3_879), vgetq_lane_f16(__s2_879, __p3_879), vgetq_lane_f16(__s2_879, __p3_879), vgetq_lane_f16(__s2_879, __p3_879)}); \ + __ret_879; \ }) #else -#define vfmlal_laneq_high_f16(__p0_788, __p1_788, __p2_788, __p3_788) __extension__ ({ \ - float32x2_t __s0_788 = __p0_788; \ - float16x4_t __s1_788 = __p1_788; \ - float16x8_t __s2_788 = __p2_788; \ - float32x2_t __rev0_788; __rev0_788 = __builtin_shufflevector(__s0_788, __s0_788, 1, 0); \ - float16x4_t __rev1_788; __rev1_788 = __builtin_shufflevector(__s1_788, __s1_788, 3, 2, 1, 0); \ - 
float16x8_t __rev2_788; __rev2_788 = __builtin_shufflevector(__s2_788, __s2_788, 7, 6, 5, 4, 3, 2, 1, 0); \ - float32x2_t __ret_788; \ - __ret_788 = __noswap_vfmlal_high_f16(__rev0_788, __rev1_788, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_788, __p3_788), __noswap_vgetq_lane_f16(__rev2_788, __p3_788), __noswap_vgetq_lane_f16(__rev2_788, __p3_788), __noswap_vgetq_lane_f16(__rev2_788, __p3_788)}); \ - __ret_788 = __builtin_shufflevector(__ret_788, __ret_788, 1, 0); \ - __ret_788; \ +#define vfmlal_laneq_high_f16(__p0_880, __p1_880, __p2_880, __p3_880) __extension__ ({ \ + float32x2_t __s0_880 = __p0_880; \ + float16x4_t __s1_880 = __p1_880; \ + float16x8_t __s2_880 = __p2_880; \ + float32x2_t __rev0_880; __rev0_880 = __builtin_shufflevector(__s0_880, __s0_880, 1, 0); \ + float16x4_t __rev1_880; __rev1_880 = __builtin_shufflevector(__s1_880, __s1_880, 3, 2, 1, 0); \ + float16x8_t __rev2_880; __rev2_880 = __builtin_shufflevector(__s2_880, __s2_880, 7, 6, 5, 4, 3, 2, 1, 0); \ + float32x2_t __ret_880; \ + __ret_880 = __noswap_vfmlal_high_f16(__rev0_880, __rev1_880, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_880, __p3_880), __noswap_vgetq_lane_f16(__rev2_880, __p3_880), __noswap_vgetq_lane_f16(__rev2_880, __p3_880), __noswap_vgetq_lane_f16(__rev2_880, __p3_880)}); \ + __ret_880 = __builtin_shufflevector(__ret_880, __ret_880, 1, 0); \ + __ret_880; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlalq_laneq_low_f16(__p0_789, __p1_789, __p2_789, __p3_789) __extension__ ({ \ - float32x4_t __s0_789 = __p0_789; \ - float16x8_t __s1_789 = __p1_789; \ - float16x8_t __s2_789 = __p2_789; \ - float32x4_t __ret_789; \ - __ret_789 = vfmlalq_low_f16(__s0_789, __s1_789, (float16x8_t) {vgetq_lane_f16(__s2_789, __p3_789), vgetq_lane_f16(__s2_789, __p3_789), vgetq_lane_f16(__s2_789, __p3_789), vgetq_lane_f16(__s2_789, __p3_789), vgetq_lane_f16(__s2_789, __p3_789), vgetq_lane_f16(__s2_789, __p3_789), vgetq_lane_f16(__s2_789, __p3_789), vgetq_lane_f16(__s2_789, __p3_789)}); \ - __ret_789; \ +#define vfmlalq_laneq_low_f16(__p0_881, __p1_881, __p2_881, __p3_881) __extension__ ({ \ + float32x4_t __s0_881 = __p0_881; \ + float16x8_t __s1_881 = __p1_881; \ + float16x8_t __s2_881 = __p2_881; \ + float32x4_t __ret_881; \ + __ret_881 = vfmlalq_low_f16(__s0_881, __s1_881, (float16x8_t) {vgetq_lane_f16(__s2_881, __p3_881), vgetq_lane_f16(__s2_881, __p3_881), vgetq_lane_f16(__s2_881, __p3_881), vgetq_lane_f16(__s2_881, __p3_881), vgetq_lane_f16(__s2_881, __p3_881), vgetq_lane_f16(__s2_881, __p3_881), vgetq_lane_f16(__s2_881, __p3_881), vgetq_lane_f16(__s2_881, __p3_881)}); \ + __ret_881; \ }) #else -#define vfmlalq_laneq_low_f16(__p0_790, __p1_790, __p2_790, __p3_790) __extension__ ({ \ - float32x4_t __s0_790 = __p0_790; \ - float16x8_t __s1_790 = __p1_790; \ - float16x8_t __s2_790 = __p2_790; \ - float32x4_t __rev0_790; __rev0_790 = __builtin_shufflevector(__s0_790, __s0_790, 3, 2, 1, 0); \ - float16x8_t __rev1_790; __rev1_790 = __builtin_shufflevector(__s1_790, __s1_790, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x8_t __rev2_790; __rev2_790 = __builtin_shufflevector(__s2_790, __s2_790, 7, 6, 5, 4, 3, 2, 1, 0); \ - float32x4_t __ret_790; \ - __ret_790 = __noswap_vfmlalq_low_f16(__rev0_790, __rev1_790, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_790, __p3_790), __noswap_vgetq_lane_f16(__rev2_790, __p3_790), __noswap_vgetq_lane_f16(__rev2_790, __p3_790), __noswap_vgetq_lane_f16(__rev2_790, __p3_790), __noswap_vgetq_lane_f16(__rev2_790, __p3_790), __noswap_vgetq_lane_f16(__rev2_790, __p3_790), 
__noswap_vgetq_lane_f16(__rev2_790, __p3_790), __noswap_vgetq_lane_f16(__rev2_790, __p3_790)}); \ - __ret_790 = __builtin_shufflevector(__ret_790, __ret_790, 3, 2, 1, 0); \ - __ret_790; \ +#define vfmlalq_laneq_low_f16(__p0_882, __p1_882, __p2_882, __p3_882) __extension__ ({ \ + float32x4_t __s0_882 = __p0_882; \ + float16x8_t __s1_882 = __p1_882; \ + float16x8_t __s2_882 = __p2_882; \ + float32x4_t __rev0_882; __rev0_882 = __builtin_shufflevector(__s0_882, __s0_882, 3, 2, 1, 0); \ + float16x8_t __rev1_882; __rev1_882 = __builtin_shufflevector(__s1_882, __s1_882, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev2_882; __rev2_882 = __builtin_shufflevector(__s2_882, __s2_882, 7, 6, 5, 4, 3, 2, 1, 0); \ + float32x4_t __ret_882; \ + __ret_882 = __noswap_vfmlalq_low_f16(__rev0_882, __rev1_882, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_882, __p3_882), __noswap_vgetq_lane_f16(__rev2_882, __p3_882), __noswap_vgetq_lane_f16(__rev2_882, __p3_882), __noswap_vgetq_lane_f16(__rev2_882, __p3_882), __noswap_vgetq_lane_f16(__rev2_882, __p3_882), __noswap_vgetq_lane_f16(__rev2_882, __p3_882), __noswap_vgetq_lane_f16(__rev2_882, __p3_882), __noswap_vgetq_lane_f16(__rev2_882, __p3_882)}); \ + __ret_882 = __builtin_shufflevector(__ret_882, __ret_882, 3, 2, 1, 0); \ + __ret_882; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlal_laneq_low_f16(__p0_791, __p1_791, __p2_791, __p3_791) __extension__ ({ \ - float32x2_t __s0_791 = __p0_791; \ - float16x4_t __s1_791 = __p1_791; \ - float16x8_t __s2_791 = __p2_791; \ - float32x2_t __ret_791; \ - __ret_791 = vfmlal_low_f16(__s0_791, __s1_791, (float16x4_t) {vgetq_lane_f16(__s2_791, __p3_791), vgetq_lane_f16(__s2_791, __p3_791), vgetq_lane_f16(__s2_791, __p3_791), vgetq_lane_f16(__s2_791, __p3_791)}); \ - __ret_791; \ +#define vfmlal_laneq_low_f16(__p0_883, __p1_883, __p2_883, __p3_883) __extension__ ({ \ + float32x2_t __s0_883 = __p0_883; \ + float16x4_t __s1_883 = __p1_883; \ + float16x8_t __s2_883 = __p2_883; \ + float32x2_t __ret_883; \ + __ret_883 = vfmlal_low_f16(__s0_883, __s1_883, (float16x4_t) {vgetq_lane_f16(__s2_883, __p3_883), vgetq_lane_f16(__s2_883, __p3_883), vgetq_lane_f16(__s2_883, __p3_883), vgetq_lane_f16(__s2_883, __p3_883)}); \ + __ret_883; \ }) #else -#define vfmlal_laneq_low_f16(__p0_792, __p1_792, __p2_792, __p3_792) __extension__ ({ \ - float32x2_t __s0_792 = __p0_792; \ - float16x4_t __s1_792 = __p1_792; \ - float16x8_t __s2_792 = __p2_792; \ - float32x2_t __rev0_792; __rev0_792 = __builtin_shufflevector(__s0_792, __s0_792, 1, 0); \ - float16x4_t __rev1_792; __rev1_792 = __builtin_shufflevector(__s1_792, __s1_792, 3, 2, 1, 0); \ - float16x8_t __rev2_792; __rev2_792 = __builtin_shufflevector(__s2_792, __s2_792, 7, 6, 5, 4, 3, 2, 1, 0); \ - float32x2_t __ret_792; \ - __ret_792 = __noswap_vfmlal_low_f16(__rev0_792, __rev1_792, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_792, __p3_792), __noswap_vgetq_lane_f16(__rev2_792, __p3_792), __noswap_vgetq_lane_f16(__rev2_792, __p3_792), __noswap_vgetq_lane_f16(__rev2_792, __p3_792)}); \ - __ret_792 = __builtin_shufflevector(__ret_792, __ret_792, 1, 0); \ - __ret_792; \ +#define vfmlal_laneq_low_f16(__p0_884, __p1_884, __p2_884, __p3_884) __extension__ ({ \ + float32x2_t __s0_884 = __p0_884; \ + float16x4_t __s1_884 = __p1_884; \ + float16x8_t __s2_884 = __p2_884; \ + float32x2_t __rev0_884; __rev0_884 = __builtin_shufflevector(__s0_884, __s0_884, 1, 0); \ + float16x4_t __rev1_884; __rev1_884 = __builtin_shufflevector(__s1_884, __s1_884, 3, 2, 1, 0); \ + float16x8_t __rev2_884; __rev2_884 = 
__builtin_shufflevector(__s2_884, __s2_884, 7, 6, 5, 4, 3, 2, 1, 0); \ + float32x2_t __ret_884; \ + __ret_884 = __noswap_vfmlal_low_f16(__rev0_884, __rev1_884, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_884, __p3_884), __noswap_vgetq_lane_f16(__rev2_884, __p3_884), __noswap_vgetq_lane_f16(__rev2_884, __p3_884), __noswap_vgetq_lane_f16(__rev2_884, __p3_884)}); \ + __ret_884 = __builtin_shufflevector(__ret_884, __ret_884, 1, 0); \ + __ret_884; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlslq_lane_high_f16(__p0_793, __p1_793, __p2_793, __p3_793) __extension__ ({ \ - float32x4_t __s0_793 = __p0_793; \ - float16x8_t __s1_793 = __p1_793; \ - float16x4_t __s2_793 = __p2_793; \ - float32x4_t __ret_793; \ - __ret_793 = vfmlslq_high_f16(__s0_793, __s1_793, (float16x8_t) {vget_lane_f16(__s2_793, __p3_793), vget_lane_f16(__s2_793, __p3_793), vget_lane_f16(__s2_793, __p3_793), vget_lane_f16(__s2_793, __p3_793), vget_lane_f16(__s2_793, __p3_793), vget_lane_f16(__s2_793, __p3_793), vget_lane_f16(__s2_793, __p3_793), vget_lane_f16(__s2_793, __p3_793)}); \ - __ret_793; \ +#define vfmlslq_lane_high_f16(__p0_885, __p1_885, __p2_885, __p3_885) __extension__ ({ \ + float32x4_t __s0_885 = __p0_885; \ + float16x8_t __s1_885 = __p1_885; \ + float16x4_t __s2_885 = __p2_885; \ + float32x4_t __ret_885; \ + __ret_885 = vfmlslq_high_f16(__s0_885, __s1_885, (float16x8_t) {vget_lane_f16(__s2_885, __p3_885), vget_lane_f16(__s2_885, __p3_885), vget_lane_f16(__s2_885, __p3_885), vget_lane_f16(__s2_885, __p3_885), vget_lane_f16(__s2_885, __p3_885), vget_lane_f16(__s2_885, __p3_885), vget_lane_f16(__s2_885, __p3_885), vget_lane_f16(__s2_885, __p3_885)}); \ + __ret_885; \ }) #else -#define vfmlslq_lane_high_f16(__p0_794, __p1_794, __p2_794, __p3_794) __extension__ ({ \ - float32x4_t __s0_794 = __p0_794; \ - float16x8_t __s1_794 = __p1_794; \ - float16x4_t __s2_794 = __p2_794; \ - float32x4_t __rev0_794; __rev0_794 = __builtin_shufflevector(__s0_794, __s0_794, 3, 2, 1, 0); \ - float16x8_t __rev1_794; __rev1_794 = __builtin_shufflevector(__s1_794, __s1_794, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x4_t __rev2_794; __rev2_794 = __builtin_shufflevector(__s2_794, __s2_794, 3, 2, 1, 0); \ - float32x4_t __ret_794; \ - __ret_794 = __noswap_vfmlslq_high_f16(__rev0_794, __rev1_794, (float16x8_t) {__noswap_vget_lane_f16(__rev2_794, __p3_794), __noswap_vget_lane_f16(__rev2_794, __p3_794), __noswap_vget_lane_f16(__rev2_794, __p3_794), __noswap_vget_lane_f16(__rev2_794, __p3_794), __noswap_vget_lane_f16(__rev2_794, __p3_794), __noswap_vget_lane_f16(__rev2_794, __p3_794), __noswap_vget_lane_f16(__rev2_794, __p3_794), __noswap_vget_lane_f16(__rev2_794, __p3_794)}); \ - __ret_794 = __builtin_shufflevector(__ret_794, __ret_794, 3, 2, 1, 0); \ - __ret_794; \ +#define vfmlslq_lane_high_f16(__p0_886, __p1_886, __p2_886, __p3_886) __extension__ ({ \ + float32x4_t __s0_886 = __p0_886; \ + float16x8_t __s1_886 = __p1_886; \ + float16x4_t __s2_886 = __p2_886; \ + float32x4_t __rev0_886; __rev0_886 = __builtin_shufflevector(__s0_886, __s0_886, 3, 2, 1, 0); \ + float16x8_t __rev1_886; __rev1_886 = __builtin_shufflevector(__s1_886, __s1_886, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __rev2_886; __rev2_886 = __builtin_shufflevector(__s2_886, __s2_886, 3, 2, 1, 0); \ + float32x4_t __ret_886; \ + __ret_886 = __noswap_vfmlslq_high_f16(__rev0_886, __rev1_886, (float16x8_t) {__noswap_vget_lane_f16(__rev2_886, __p3_886), __noswap_vget_lane_f16(__rev2_886, __p3_886), __noswap_vget_lane_f16(__rev2_886, __p3_886), __noswap_vget_lane_f16(__rev2_886, __p3_886), 
__noswap_vget_lane_f16(__rev2_886, __p3_886), __noswap_vget_lane_f16(__rev2_886, __p3_886), __noswap_vget_lane_f16(__rev2_886, __p3_886), __noswap_vget_lane_f16(__rev2_886, __p3_886)}); \ + __ret_886 = __builtin_shufflevector(__ret_886, __ret_886, 3, 2, 1, 0); \ + __ret_886; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlsl_lane_high_f16(__p0_795, __p1_795, __p2_795, __p3_795) __extension__ ({ \ - float32x2_t __s0_795 = __p0_795; \ - float16x4_t __s1_795 = __p1_795; \ - float16x4_t __s2_795 = __p2_795; \ - float32x2_t __ret_795; \ - __ret_795 = vfmlsl_high_f16(__s0_795, __s1_795, (float16x4_t) {vget_lane_f16(__s2_795, __p3_795), vget_lane_f16(__s2_795, __p3_795), vget_lane_f16(__s2_795, __p3_795), vget_lane_f16(__s2_795, __p3_795)}); \ - __ret_795; \ +#define vfmlsl_lane_high_f16(__p0_887, __p1_887, __p2_887, __p3_887) __extension__ ({ \ + float32x2_t __s0_887 = __p0_887; \ + float16x4_t __s1_887 = __p1_887; \ + float16x4_t __s2_887 = __p2_887; \ + float32x2_t __ret_887; \ + __ret_887 = vfmlsl_high_f16(__s0_887, __s1_887, (float16x4_t) {vget_lane_f16(__s2_887, __p3_887), vget_lane_f16(__s2_887, __p3_887), vget_lane_f16(__s2_887, __p3_887), vget_lane_f16(__s2_887, __p3_887)}); \ + __ret_887; \ }) #else -#define vfmlsl_lane_high_f16(__p0_796, __p1_796, __p2_796, __p3_796) __extension__ ({ \ - float32x2_t __s0_796 = __p0_796; \ - float16x4_t __s1_796 = __p1_796; \ - float16x4_t __s2_796 = __p2_796; \ - float32x2_t __rev0_796; __rev0_796 = __builtin_shufflevector(__s0_796, __s0_796, 1, 0); \ - float16x4_t __rev1_796; __rev1_796 = __builtin_shufflevector(__s1_796, __s1_796, 3, 2, 1, 0); \ - float16x4_t __rev2_796; __rev2_796 = __builtin_shufflevector(__s2_796, __s2_796, 3, 2, 1, 0); \ - float32x2_t __ret_796; \ - __ret_796 = __noswap_vfmlsl_high_f16(__rev0_796, __rev1_796, (float16x4_t) {__noswap_vget_lane_f16(__rev2_796, __p3_796), __noswap_vget_lane_f16(__rev2_796, __p3_796), __noswap_vget_lane_f16(__rev2_796, __p3_796), __noswap_vget_lane_f16(__rev2_796, __p3_796)}); \ - __ret_796 = __builtin_shufflevector(__ret_796, __ret_796, 1, 0); \ - __ret_796; \ +#define vfmlsl_lane_high_f16(__p0_888, __p1_888, __p2_888, __p3_888) __extension__ ({ \ + float32x2_t __s0_888 = __p0_888; \ + float16x4_t __s1_888 = __p1_888; \ + float16x4_t __s2_888 = __p2_888; \ + float32x2_t __rev0_888; __rev0_888 = __builtin_shufflevector(__s0_888, __s0_888, 1, 0); \ + float16x4_t __rev1_888; __rev1_888 = __builtin_shufflevector(__s1_888, __s1_888, 3, 2, 1, 0); \ + float16x4_t __rev2_888; __rev2_888 = __builtin_shufflevector(__s2_888, __s2_888, 3, 2, 1, 0); \ + float32x2_t __ret_888; \ + __ret_888 = __noswap_vfmlsl_high_f16(__rev0_888, __rev1_888, (float16x4_t) {__noswap_vget_lane_f16(__rev2_888, __p3_888), __noswap_vget_lane_f16(__rev2_888, __p3_888), __noswap_vget_lane_f16(__rev2_888, __p3_888), __noswap_vget_lane_f16(__rev2_888, __p3_888)}); \ + __ret_888 = __builtin_shufflevector(__ret_888, __ret_888, 1, 0); \ + __ret_888; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlslq_lane_low_f16(__p0_797, __p1_797, __p2_797, __p3_797) __extension__ ({ \ - float32x4_t __s0_797 = __p0_797; \ - float16x8_t __s1_797 = __p1_797; \ - float16x4_t __s2_797 = __p2_797; \ - float32x4_t __ret_797; \ - __ret_797 = vfmlslq_low_f16(__s0_797, __s1_797, (float16x8_t) {vget_lane_f16(__s2_797, __p3_797), vget_lane_f16(__s2_797, __p3_797), vget_lane_f16(__s2_797, __p3_797), vget_lane_f16(__s2_797, __p3_797), vget_lane_f16(__s2_797, __p3_797), vget_lane_f16(__s2_797, __p3_797), vget_lane_f16(__s2_797, __p3_797), vget_lane_f16(__s2_797, 
__p3_797)}); \ - __ret_797; \ +#define vfmlslq_lane_low_f16(__p0_889, __p1_889, __p2_889, __p3_889) __extension__ ({ \ + float32x4_t __s0_889 = __p0_889; \ + float16x8_t __s1_889 = __p1_889; \ + float16x4_t __s2_889 = __p2_889; \ + float32x4_t __ret_889; \ + __ret_889 = vfmlslq_low_f16(__s0_889, __s1_889, (float16x8_t) {vget_lane_f16(__s2_889, __p3_889), vget_lane_f16(__s2_889, __p3_889), vget_lane_f16(__s2_889, __p3_889), vget_lane_f16(__s2_889, __p3_889), vget_lane_f16(__s2_889, __p3_889), vget_lane_f16(__s2_889, __p3_889), vget_lane_f16(__s2_889, __p3_889), vget_lane_f16(__s2_889, __p3_889)}); \ + __ret_889; \ }) #else -#define vfmlslq_lane_low_f16(__p0_798, __p1_798, __p2_798, __p3_798) __extension__ ({ \ - float32x4_t __s0_798 = __p0_798; \ - float16x8_t __s1_798 = __p1_798; \ - float16x4_t __s2_798 = __p2_798; \ - float32x4_t __rev0_798; __rev0_798 = __builtin_shufflevector(__s0_798, __s0_798, 3, 2, 1, 0); \ - float16x8_t __rev1_798; __rev1_798 = __builtin_shufflevector(__s1_798, __s1_798, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x4_t __rev2_798; __rev2_798 = __builtin_shufflevector(__s2_798, __s2_798, 3, 2, 1, 0); \ - float32x4_t __ret_798; \ - __ret_798 = __noswap_vfmlslq_low_f16(__rev0_798, __rev1_798, (float16x8_t) {__noswap_vget_lane_f16(__rev2_798, __p3_798), __noswap_vget_lane_f16(__rev2_798, __p3_798), __noswap_vget_lane_f16(__rev2_798, __p3_798), __noswap_vget_lane_f16(__rev2_798, __p3_798), __noswap_vget_lane_f16(__rev2_798, __p3_798), __noswap_vget_lane_f16(__rev2_798, __p3_798), __noswap_vget_lane_f16(__rev2_798, __p3_798), __noswap_vget_lane_f16(__rev2_798, __p3_798)}); \ - __ret_798 = __builtin_shufflevector(__ret_798, __ret_798, 3, 2, 1, 0); \ - __ret_798; \ +#define vfmlslq_lane_low_f16(__p0_890, __p1_890, __p2_890, __p3_890) __extension__ ({ \ + float32x4_t __s0_890 = __p0_890; \ + float16x8_t __s1_890 = __p1_890; \ + float16x4_t __s2_890 = __p2_890; \ + float32x4_t __rev0_890; __rev0_890 = __builtin_shufflevector(__s0_890, __s0_890, 3, 2, 1, 0); \ + float16x8_t __rev1_890; __rev1_890 = __builtin_shufflevector(__s1_890, __s1_890, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __rev2_890; __rev2_890 = __builtin_shufflevector(__s2_890, __s2_890, 3, 2, 1, 0); \ + float32x4_t __ret_890; \ + __ret_890 = __noswap_vfmlslq_low_f16(__rev0_890, __rev1_890, (float16x8_t) {__noswap_vget_lane_f16(__rev2_890, __p3_890), __noswap_vget_lane_f16(__rev2_890, __p3_890), __noswap_vget_lane_f16(__rev2_890, __p3_890), __noswap_vget_lane_f16(__rev2_890, __p3_890), __noswap_vget_lane_f16(__rev2_890, __p3_890), __noswap_vget_lane_f16(__rev2_890, __p3_890), __noswap_vget_lane_f16(__rev2_890, __p3_890), __noswap_vget_lane_f16(__rev2_890, __p3_890)}); \ + __ret_890 = __builtin_shufflevector(__ret_890, __ret_890, 3, 2, 1, 0); \ + __ret_890; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlsl_lane_low_f16(__p0_799, __p1_799, __p2_799, __p3_799) __extension__ ({ \ - float32x2_t __s0_799 = __p0_799; \ - float16x4_t __s1_799 = __p1_799; \ - float16x4_t __s2_799 = __p2_799; \ - float32x2_t __ret_799; \ - __ret_799 = vfmlsl_low_f16(__s0_799, __s1_799, (float16x4_t) {vget_lane_f16(__s2_799, __p3_799), vget_lane_f16(__s2_799, __p3_799), vget_lane_f16(__s2_799, __p3_799), vget_lane_f16(__s2_799, __p3_799)}); \ - __ret_799; \ +#define vfmlsl_lane_low_f16(__p0_891, __p1_891, __p2_891, __p3_891) __extension__ ({ \ + float32x2_t __s0_891 = __p0_891; \ + float16x4_t __s1_891 = __p1_891; \ + float16x4_t __s2_891 = __p2_891; \ + float32x2_t __ret_891; \ + __ret_891 = vfmlsl_low_f16(__s0_891, __s1_891, (float16x4_t) 
{vget_lane_f16(__s2_891, __p3_891), vget_lane_f16(__s2_891, __p3_891), vget_lane_f16(__s2_891, __p3_891), vget_lane_f16(__s2_891, __p3_891)}); \ + __ret_891; \ }) #else -#define vfmlsl_lane_low_f16(__p0_800, __p1_800, __p2_800, __p3_800) __extension__ ({ \ - float32x2_t __s0_800 = __p0_800; \ - float16x4_t __s1_800 = __p1_800; \ - float16x4_t __s2_800 = __p2_800; \ - float32x2_t __rev0_800; __rev0_800 = __builtin_shufflevector(__s0_800, __s0_800, 1, 0); \ - float16x4_t __rev1_800; __rev1_800 = __builtin_shufflevector(__s1_800, __s1_800, 3, 2, 1, 0); \ - float16x4_t __rev2_800; __rev2_800 = __builtin_shufflevector(__s2_800, __s2_800, 3, 2, 1, 0); \ - float32x2_t __ret_800; \ - __ret_800 = __noswap_vfmlsl_low_f16(__rev0_800, __rev1_800, (float16x4_t) {__noswap_vget_lane_f16(__rev2_800, __p3_800), __noswap_vget_lane_f16(__rev2_800, __p3_800), __noswap_vget_lane_f16(__rev2_800, __p3_800), __noswap_vget_lane_f16(__rev2_800, __p3_800)}); \ - __ret_800 = __builtin_shufflevector(__ret_800, __ret_800, 1, 0); \ - __ret_800; \ +#define vfmlsl_lane_low_f16(__p0_892, __p1_892, __p2_892, __p3_892) __extension__ ({ \ + float32x2_t __s0_892 = __p0_892; \ + float16x4_t __s1_892 = __p1_892; \ + float16x4_t __s2_892 = __p2_892; \ + float32x2_t __rev0_892; __rev0_892 = __builtin_shufflevector(__s0_892, __s0_892, 1, 0); \ + float16x4_t __rev1_892; __rev1_892 = __builtin_shufflevector(__s1_892, __s1_892, 3, 2, 1, 0); \ + float16x4_t __rev2_892; __rev2_892 = __builtin_shufflevector(__s2_892, __s2_892, 3, 2, 1, 0); \ + float32x2_t __ret_892; \ + __ret_892 = __noswap_vfmlsl_low_f16(__rev0_892, __rev1_892, (float16x4_t) {__noswap_vget_lane_f16(__rev2_892, __p3_892), __noswap_vget_lane_f16(__rev2_892, __p3_892), __noswap_vget_lane_f16(__rev2_892, __p3_892), __noswap_vget_lane_f16(__rev2_892, __p3_892)}); \ + __ret_892 = __builtin_shufflevector(__ret_892, __ret_892, 1, 0); \ + __ret_892; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlslq_laneq_high_f16(__p0_801, __p1_801, __p2_801, __p3_801) __extension__ ({ \ - float32x4_t __s0_801 = __p0_801; \ - float16x8_t __s1_801 = __p1_801; \ - float16x8_t __s2_801 = __p2_801; \ - float32x4_t __ret_801; \ - __ret_801 = vfmlslq_high_f16(__s0_801, __s1_801, (float16x8_t) {vgetq_lane_f16(__s2_801, __p3_801), vgetq_lane_f16(__s2_801, __p3_801), vgetq_lane_f16(__s2_801, __p3_801), vgetq_lane_f16(__s2_801, __p3_801), vgetq_lane_f16(__s2_801, __p3_801), vgetq_lane_f16(__s2_801, __p3_801), vgetq_lane_f16(__s2_801, __p3_801), vgetq_lane_f16(__s2_801, __p3_801)}); \ - __ret_801; \ +#define vfmlslq_laneq_high_f16(__p0_893, __p1_893, __p2_893, __p3_893) __extension__ ({ \ + float32x4_t __s0_893 = __p0_893; \ + float16x8_t __s1_893 = __p1_893; \ + float16x8_t __s2_893 = __p2_893; \ + float32x4_t __ret_893; \ + __ret_893 = vfmlslq_high_f16(__s0_893, __s1_893, (float16x8_t) {vgetq_lane_f16(__s2_893, __p3_893), vgetq_lane_f16(__s2_893, __p3_893), vgetq_lane_f16(__s2_893, __p3_893), vgetq_lane_f16(__s2_893, __p3_893), vgetq_lane_f16(__s2_893, __p3_893), vgetq_lane_f16(__s2_893, __p3_893), vgetq_lane_f16(__s2_893, __p3_893), vgetq_lane_f16(__s2_893, __p3_893)}); \ + __ret_893; \ }) #else -#define vfmlslq_laneq_high_f16(__p0_802, __p1_802, __p2_802, __p3_802) __extension__ ({ \ - float32x4_t __s0_802 = __p0_802; \ - float16x8_t __s1_802 = __p1_802; \ - float16x8_t __s2_802 = __p2_802; \ - float32x4_t __rev0_802; __rev0_802 = __builtin_shufflevector(__s0_802, __s0_802, 3, 2, 1, 0); \ - float16x8_t __rev1_802; __rev1_802 = __builtin_shufflevector(__s1_802, __s1_802, 7, 6, 5, 4, 3, 2, 1, 0); \ - 
float16x8_t __rev2_802; __rev2_802 = __builtin_shufflevector(__s2_802, __s2_802, 7, 6, 5, 4, 3, 2, 1, 0); \ - float32x4_t __ret_802; \ - __ret_802 = __noswap_vfmlslq_high_f16(__rev0_802, __rev1_802, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_802, __p3_802), __noswap_vgetq_lane_f16(__rev2_802, __p3_802), __noswap_vgetq_lane_f16(__rev2_802, __p3_802), __noswap_vgetq_lane_f16(__rev2_802, __p3_802), __noswap_vgetq_lane_f16(__rev2_802, __p3_802), __noswap_vgetq_lane_f16(__rev2_802, __p3_802), __noswap_vgetq_lane_f16(__rev2_802, __p3_802), __noswap_vgetq_lane_f16(__rev2_802, __p3_802)}); \ - __ret_802 = __builtin_shufflevector(__ret_802, __ret_802, 3, 2, 1, 0); \ - __ret_802; \ +#define vfmlslq_laneq_high_f16(__p0_894, __p1_894, __p2_894, __p3_894) __extension__ ({ \ + float32x4_t __s0_894 = __p0_894; \ + float16x8_t __s1_894 = __p1_894; \ + float16x8_t __s2_894 = __p2_894; \ + float32x4_t __rev0_894; __rev0_894 = __builtin_shufflevector(__s0_894, __s0_894, 3, 2, 1, 0); \ + float16x8_t __rev1_894; __rev1_894 = __builtin_shufflevector(__s1_894, __s1_894, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev2_894; __rev2_894 = __builtin_shufflevector(__s2_894, __s2_894, 7, 6, 5, 4, 3, 2, 1, 0); \ + float32x4_t __ret_894; \ + __ret_894 = __noswap_vfmlslq_high_f16(__rev0_894, __rev1_894, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_894, __p3_894), __noswap_vgetq_lane_f16(__rev2_894, __p3_894), __noswap_vgetq_lane_f16(__rev2_894, __p3_894), __noswap_vgetq_lane_f16(__rev2_894, __p3_894), __noswap_vgetq_lane_f16(__rev2_894, __p3_894), __noswap_vgetq_lane_f16(__rev2_894, __p3_894), __noswap_vgetq_lane_f16(__rev2_894, __p3_894), __noswap_vgetq_lane_f16(__rev2_894, __p3_894)}); \ + __ret_894 = __builtin_shufflevector(__ret_894, __ret_894, 3, 2, 1, 0); \ + __ret_894; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlsl_laneq_high_f16(__p0_803, __p1_803, __p2_803, __p3_803) __extension__ ({ \ - float32x2_t __s0_803 = __p0_803; \ - float16x4_t __s1_803 = __p1_803; \ - float16x8_t __s2_803 = __p2_803; \ - float32x2_t __ret_803; \ - __ret_803 = vfmlsl_high_f16(__s0_803, __s1_803, (float16x4_t) {vgetq_lane_f16(__s2_803, __p3_803), vgetq_lane_f16(__s2_803, __p3_803), vgetq_lane_f16(__s2_803, __p3_803), vgetq_lane_f16(__s2_803, __p3_803)}); \ - __ret_803; \ +#define vfmlsl_laneq_high_f16(__p0_895, __p1_895, __p2_895, __p3_895) __extension__ ({ \ + float32x2_t __s0_895 = __p0_895; \ + float16x4_t __s1_895 = __p1_895; \ + float16x8_t __s2_895 = __p2_895; \ + float32x2_t __ret_895; \ + __ret_895 = vfmlsl_high_f16(__s0_895, __s1_895, (float16x4_t) {vgetq_lane_f16(__s2_895, __p3_895), vgetq_lane_f16(__s2_895, __p3_895), vgetq_lane_f16(__s2_895, __p3_895), vgetq_lane_f16(__s2_895, __p3_895)}); \ + __ret_895; \ }) #else -#define vfmlsl_laneq_high_f16(__p0_804, __p1_804, __p2_804, __p3_804) __extension__ ({ \ - float32x2_t __s0_804 = __p0_804; \ - float16x4_t __s1_804 = __p1_804; \ - float16x8_t __s2_804 = __p2_804; \ - float32x2_t __rev0_804; __rev0_804 = __builtin_shufflevector(__s0_804, __s0_804, 1, 0); \ - float16x4_t __rev1_804; __rev1_804 = __builtin_shufflevector(__s1_804, __s1_804, 3, 2, 1, 0); \ - float16x8_t __rev2_804; __rev2_804 = __builtin_shufflevector(__s2_804, __s2_804, 7, 6, 5, 4, 3, 2, 1, 0); \ - float32x2_t __ret_804; \ - __ret_804 = __noswap_vfmlsl_high_f16(__rev0_804, __rev1_804, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_804, __p3_804), __noswap_vgetq_lane_f16(__rev2_804, __p3_804), __noswap_vgetq_lane_f16(__rev2_804, __p3_804), __noswap_vgetq_lane_f16(__rev2_804, __p3_804)}); \ - __ret_804 = 
__builtin_shufflevector(__ret_804, __ret_804, 1, 0); \ - __ret_804; \ +#define vfmlsl_laneq_high_f16(__p0_896, __p1_896, __p2_896, __p3_896) __extension__ ({ \ + float32x2_t __s0_896 = __p0_896; \ + float16x4_t __s1_896 = __p1_896; \ + float16x8_t __s2_896 = __p2_896; \ + float32x2_t __rev0_896; __rev0_896 = __builtin_shufflevector(__s0_896, __s0_896, 1, 0); \ + float16x4_t __rev1_896; __rev1_896 = __builtin_shufflevector(__s1_896, __s1_896, 3, 2, 1, 0); \ + float16x8_t __rev2_896; __rev2_896 = __builtin_shufflevector(__s2_896, __s2_896, 7, 6, 5, 4, 3, 2, 1, 0); \ + float32x2_t __ret_896; \ + __ret_896 = __noswap_vfmlsl_high_f16(__rev0_896, __rev1_896, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_896, __p3_896), __noswap_vgetq_lane_f16(__rev2_896, __p3_896), __noswap_vgetq_lane_f16(__rev2_896, __p3_896), __noswap_vgetq_lane_f16(__rev2_896, __p3_896)}); \ + __ret_896 = __builtin_shufflevector(__ret_896, __ret_896, 1, 0); \ + __ret_896; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlslq_laneq_low_f16(__p0_805, __p1_805, __p2_805, __p3_805) __extension__ ({ \ - float32x4_t __s0_805 = __p0_805; \ - float16x8_t __s1_805 = __p1_805; \ - float16x8_t __s2_805 = __p2_805; \ - float32x4_t __ret_805; \ - __ret_805 = vfmlslq_low_f16(__s0_805, __s1_805, (float16x8_t) {vgetq_lane_f16(__s2_805, __p3_805), vgetq_lane_f16(__s2_805, __p3_805), vgetq_lane_f16(__s2_805, __p3_805), vgetq_lane_f16(__s2_805, __p3_805), vgetq_lane_f16(__s2_805, __p3_805), vgetq_lane_f16(__s2_805, __p3_805), vgetq_lane_f16(__s2_805, __p3_805), vgetq_lane_f16(__s2_805, __p3_805)}); \ - __ret_805; \ +#define vfmlslq_laneq_low_f16(__p0_897, __p1_897, __p2_897, __p3_897) __extension__ ({ \ + float32x4_t __s0_897 = __p0_897; \ + float16x8_t __s1_897 = __p1_897; \ + float16x8_t __s2_897 = __p2_897; \ + float32x4_t __ret_897; \ + __ret_897 = vfmlslq_low_f16(__s0_897, __s1_897, (float16x8_t) {vgetq_lane_f16(__s2_897, __p3_897), vgetq_lane_f16(__s2_897, __p3_897), vgetq_lane_f16(__s2_897, __p3_897), vgetq_lane_f16(__s2_897, __p3_897), vgetq_lane_f16(__s2_897, __p3_897), vgetq_lane_f16(__s2_897, __p3_897), vgetq_lane_f16(__s2_897, __p3_897), vgetq_lane_f16(__s2_897, __p3_897)}); \ + __ret_897; \ }) #else -#define vfmlslq_laneq_low_f16(__p0_806, __p1_806, __p2_806, __p3_806) __extension__ ({ \ - float32x4_t __s0_806 = __p0_806; \ - float16x8_t __s1_806 = __p1_806; \ - float16x8_t __s2_806 = __p2_806; \ - float32x4_t __rev0_806; __rev0_806 = __builtin_shufflevector(__s0_806, __s0_806, 3, 2, 1, 0); \ - float16x8_t __rev1_806; __rev1_806 = __builtin_shufflevector(__s1_806, __s1_806, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x8_t __rev2_806; __rev2_806 = __builtin_shufflevector(__s2_806, __s2_806, 7, 6, 5, 4, 3, 2, 1, 0); \ - float32x4_t __ret_806; \ - __ret_806 = __noswap_vfmlslq_low_f16(__rev0_806, __rev1_806, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_806, __p3_806), __noswap_vgetq_lane_f16(__rev2_806, __p3_806), __noswap_vgetq_lane_f16(__rev2_806, __p3_806), __noswap_vgetq_lane_f16(__rev2_806, __p3_806), __noswap_vgetq_lane_f16(__rev2_806, __p3_806), __noswap_vgetq_lane_f16(__rev2_806, __p3_806), __noswap_vgetq_lane_f16(__rev2_806, __p3_806), __noswap_vgetq_lane_f16(__rev2_806, __p3_806)}); \ - __ret_806 = __builtin_shufflevector(__ret_806, __ret_806, 3, 2, 1, 0); \ - __ret_806; \ +#define vfmlslq_laneq_low_f16(__p0_898, __p1_898, __p2_898, __p3_898) __extension__ ({ \ + float32x4_t __s0_898 = __p0_898; \ + float16x8_t __s1_898 = __p1_898; \ + float16x8_t __s2_898 = __p2_898; \ + float32x4_t __rev0_898; __rev0_898 = 
__builtin_shufflevector(__s0_898, __s0_898, 3, 2, 1, 0); \ + float16x8_t __rev1_898; __rev1_898 = __builtin_shufflevector(__s1_898, __s1_898, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev2_898; __rev2_898 = __builtin_shufflevector(__s2_898, __s2_898, 7, 6, 5, 4, 3, 2, 1, 0); \ + float32x4_t __ret_898; \ + __ret_898 = __noswap_vfmlslq_low_f16(__rev0_898, __rev1_898, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_898, __p3_898), __noswap_vgetq_lane_f16(__rev2_898, __p3_898), __noswap_vgetq_lane_f16(__rev2_898, __p3_898), __noswap_vgetq_lane_f16(__rev2_898, __p3_898), __noswap_vgetq_lane_f16(__rev2_898, __p3_898), __noswap_vgetq_lane_f16(__rev2_898, __p3_898), __noswap_vgetq_lane_f16(__rev2_898, __p3_898), __noswap_vgetq_lane_f16(__rev2_898, __p3_898)}); \ + __ret_898 = __builtin_shufflevector(__ret_898, __ret_898, 3, 2, 1, 0); \ + __ret_898; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlsl_laneq_low_f16(__p0_807, __p1_807, __p2_807, __p3_807) __extension__ ({ \ - float32x2_t __s0_807 = __p0_807; \ - float16x4_t __s1_807 = __p1_807; \ - float16x8_t __s2_807 = __p2_807; \ - float32x2_t __ret_807; \ - __ret_807 = vfmlsl_low_f16(__s0_807, __s1_807, (float16x4_t) {vgetq_lane_f16(__s2_807, __p3_807), vgetq_lane_f16(__s2_807, __p3_807), vgetq_lane_f16(__s2_807, __p3_807), vgetq_lane_f16(__s2_807, __p3_807)}); \ - __ret_807; \ +#define vfmlsl_laneq_low_f16(__p0_899, __p1_899, __p2_899, __p3_899) __extension__ ({ \ + float32x2_t __s0_899 = __p0_899; \ + float16x4_t __s1_899 = __p1_899; \ + float16x8_t __s2_899 = __p2_899; \ + float32x2_t __ret_899; \ + __ret_899 = vfmlsl_low_f16(__s0_899, __s1_899, (float16x4_t) {vgetq_lane_f16(__s2_899, __p3_899), vgetq_lane_f16(__s2_899, __p3_899), vgetq_lane_f16(__s2_899, __p3_899), vgetq_lane_f16(__s2_899, __p3_899)}); \ + __ret_899; \ }) #else -#define vfmlsl_laneq_low_f16(__p0_808, __p1_808, __p2_808, __p3_808) __extension__ ({ \ - float32x2_t __s0_808 = __p0_808; \ - float16x4_t __s1_808 = __p1_808; \ - float16x8_t __s2_808 = __p2_808; \ - float32x2_t __rev0_808; __rev0_808 = __builtin_shufflevector(__s0_808, __s0_808, 1, 0); \ - float16x4_t __rev1_808; __rev1_808 = __builtin_shufflevector(__s1_808, __s1_808, 3, 2, 1, 0); \ - float16x8_t __rev2_808; __rev2_808 = __builtin_shufflevector(__s2_808, __s2_808, 7, 6, 5, 4, 3, 2, 1, 0); \ - float32x2_t __ret_808; \ - __ret_808 = __noswap_vfmlsl_low_f16(__rev0_808, __rev1_808, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_808, __p3_808), __noswap_vgetq_lane_f16(__rev2_808, __p3_808), __noswap_vgetq_lane_f16(__rev2_808, __p3_808), __noswap_vgetq_lane_f16(__rev2_808, __p3_808)}); \ - __ret_808 = __builtin_shufflevector(__ret_808, __ret_808, 1, 0); \ - __ret_808; \ +#define vfmlsl_laneq_low_f16(__p0_900, __p1_900, __p2_900, __p3_900) __extension__ ({ \ + float32x2_t __s0_900 = __p0_900; \ + float16x4_t __s1_900 = __p1_900; \ + float16x8_t __s2_900 = __p2_900; \ + float32x2_t __rev0_900; __rev0_900 = __builtin_shufflevector(__s0_900, __s0_900, 1, 0); \ + float16x4_t __rev1_900; __rev1_900 = __builtin_shufflevector(__s1_900, __s1_900, 3, 2, 1, 0); \ + float16x8_t __rev2_900; __rev2_900 = __builtin_shufflevector(__s2_900, __s2_900, 7, 6, 5, 4, 3, 2, 1, 0); \ + float32x2_t __ret_900; \ + __ret_900 = __noswap_vfmlsl_low_f16(__rev0_900, __rev1_900, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_900, __p3_900), __noswap_vgetq_lane_f16(__rev2_900, __p3_900), __noswap_vgetq_lane_f16(__rev2_900, __p3_900), __noswap_vgetq_lane_f16(__rev2_900, __p3_900)}); \ + __ret_900 = __builtin_shufflevector(__ret_900, __ret_900, 1, 0); \ + 
__ret_900; \ }) #endif #endif #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(__aarch64__) #ifdef __LITTLE_ENDIAN__ -#define vmulh_lane_f16(__p0_809, __p1_809, __p2_809) __extension__ ({ \ - float16_t __s0_809 = __p0_809; \ - float16x4_t __s1_809 = __p1_809; \ - float16_t __ret_809; \ - __ret_809 = __s0_809 * vget_lane_f16(__s1_809, __p2_809); \ - __ret_809; \ +#define vmulh_lane_f16(__p0_901, __p1_901, __p2_901) __extension__ ({ \ + float16_t __s0_901 = __p0_901; \ + float16x4_t __s1_901 = __p1_901; \ + float16_t __ret_901; \ + __ret_901 = __s0_901 * vget_lane_f16(__s1_901, __p2_901); \ + __ret_901; \ }) #else -#define vmulh_lane_f16(__p0_810, __p1_810, __p2_810) __extension__ ({ \ - float16_t __s0_810 = __p0_810; \ - float16x4_t __s1_810 = __p1_810; \ - float16x4_t __rev1_810; __rev1_810 = __builtin_shufflevector(__s1_810, __s1_810, 3, 2, 1, 0); \ - float16_t __ret_810; \ - __ret_810 = __s0_810 * __noswap_vget_lane_f16(__rev1_810, __p2_810); \ - __ret_810; \ +#define vmulh_lane_f16(__p0_902, __p1_902, __p2_902) __extension__ ({ \ + float16_t __s0_902 = __p0_902; \ + float16x4_t __s1_902 = __p1_902; \ + float16x4_t __rev1_902; __rev1_902 = __builtin_shufflevector(__s1_902, __s1_902, 3, 2, 1, 0); \ + float16_t __ret_902; \ + __ret_902 = __s0_902 * __noswap_vget_lane_f16(__rev1_902, __p2_902); \ + __ret_902; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmulh_laneq_f16(__p0_811, __p1_811, __p2_811) __extension__ ({ \ - float16_t __s0_811 = __p0_811; \ - float16x8_t __s1_811 = __p1_811; \ - float16_t __ret_811; \ - __ret_811 = __s0_811 * vgetq_lane_f16(__s1_811, __p2_811); \ - __ret_811; \ +#define vmulh_laneq_f16(__p0_903, __p1_903, __p2_903) __extension__ ({ \ + float16_t __s0_903 = __p0_903; \ + float16x8_t __s1_903 = __p1_903; \ + float16_t __ret_903; \ + __ret_903 = __s0_903 * vgetq_lane_f16(__s1_903, __p2_903); \ + __ret_903; \ }) #else -#define vmulh_laneq_f16(__p0_812, __p1_812, __p2_812) __extension__ ({ \ - float16_t __s0_812 = __p0_812; \ - float16x8_t __s1_812 = __p1_812; \ - float16x8_t __rev1_812; __rev1_812 = __builtin_shufflevector(__s1_812, __s1_812, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16_t __ret_812; \ - __ret_812 = __s0_812 * __noswap_vgetq_lane_f16(__rev1_812, __p2_812); \ - __ret_812; \ +#define vmulh_laneq_f16(__p0_904, __p1_904, __p2_904) __extension__ ({ \ + float16_t __s0_904 = __p0_904; \ + float16x8_t __s1_904 = __p1_904; \ + float16x8_t __rev1_904; __rev1_904 = __builtin_shufflevector(__s1_904, __s1_904, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16_t __ret_904; \ + __ret_904 = __s0_904 * __noswap_vgetq_lane_f16(__rev1_904, __p2_904); \ + __ret_904; \ }) #endif #endif #if defined(__ARM_FEATURE_MATMUL_INT8) #ifdef __LITTLE_ENDIAN__ -#define vsudotq_lane_s32(__p0_813, __p1_813, __p2_813, __p3_813) __extension__ ({ \ - int32x4_t __s0_813 = __p0_813; \ - int8x16_t __s1_813 = __p1_813; \ - uint8x8_t __s2_813 = __p2_813; \ - int32x4_t __ret_813; \ -uint8x8_t __reint_813 = __s2_813; \ - __ret_813 = vusdotq_s32(__s0_813, (uint8x16_t)(splatq_lane_s32(*(int32x2_t *) &__reint_813, __p3_813)), __s1_813); \ - __ret_813; \ +#define vsudotq_lane_s32(__p0_905, __p1_905, __p2_905, __p3_905) __extension__ ({ \ + int32x4_t __s0_905 = __p0_905; \ + int8x16_t __s1_905 = __p1_905; \ + uint8x8_t __s2_905 = __p2_905; \ + int32x4_t __ret_905; \ +uint8x8_t __reint_905 = __s2_905; \ + __ret_905 = vusdotq_s32(__s0_905, (uint8x16_t)(splatq_lane_s32(*(int32x2_t *) &__reint_905, __p3_905)), __s1_905); \ + __ret_905; \ }) #else -#define vsudotq_lane_s32(__p0_814, __p1_814, __p2_814, __p3_814) 
__extension__ ({ \ - int32x4_t __s0_814 = __p0_814; \ - int8x16_t __s1_814 = __p1_814; \ - uint8x8_t __s2_814 = __p2_814; \ - int32x4_t __rev0_814; __rev0_814 = __builtin_shufflevector(__s0_814, __s0_814, 3, 2, 1, 0); \ - int8x16_t __rev1_814; __rev1_814 = __builtin_shufflevector(__s1_814, __s1_814, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x8_t __rev2_814; __rev2_814 = __builtin_shufflevector(__s2_814, __s2_814, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32x4_t __ret_814; \ -uint8x8_t __reint_814 = __rev2_814; \ - __ret_814 = __noswap_vusdotq_s32(__rev0_814, (uint8x16_t)(__noswap_splatq_lane_s32(*(int32x2_t *) &__reint_814, __p3_814)), __rev1_814); \ - __ret_814 = __builtin_shufflevector(__ret_814, __ret_814, 3, 2, 1, 0); \ - __ret_814; \ +#define vsudotq_lane_s32(__p0_906, __p1_906, __p2_906, __p3_906) __extension__ ({ \ + int32x4_t __s0_906 = __p0_906; \ + int8x16_t __s1_906 = __p1_906; \ + uint8x8_t __s2_906 = __p2_906; \ + int32x4_t __rev0_906; __rev0_906 = __builtin_shufflevector(__s0_906, __s0_906, 3, 2, 1, 0); \ + int8x16_t __rev1_906; __rev1_906 = __builtin_shufflevector(__s1_906, __s1_906, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev2_906; __rev2_906 = __builtin_shufflevector(__s2_906, __s2_906, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x4_t __ret_906; \ +uint8x8_t __reint_906 = __rev2_906; \ + __ret_906 = __noswap_vusdotq_s32(__rev0_906, (uint8x16_t)(__noswap_splatq_lane_s32(*(int32x2_t *) &__reint_906, __p3_906)), __rev1_906); \ + __ret_906 = __builtin_shufflevector(__ret_906, __ret_906, 3, 2, 1, 0); \ + __ret_906; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vsudot_lane_s32(__p0_815, __p1_815, __p2_815, __p3_815) __extension__ ({ \ - int32x2_t __s0_815 = __p0_815; \ - int8x8_t __s1_815 = __p1_815; \ - uint8x8_t __s2_815 = __p2_815; \ - int32x2_t __ret_815; \ -uint8x8_t __reint_815 = __s2_815; \ - __ret_815 = vusdot_s32(__s0_815, (uint8x8_t)(splat_lane_s32(*(int32x2_t *) &__reint_815, __p3_815)), __s1_815); \ - __ret_815; \ +#define vsudot_lane_s32(__p0_907, __p1_907, __p2_907, __p3_907) __extension__ ({ \ + int32x2_t __s0_907 = __p0_907; \ + int8x8_t __s1_907 = __p1_907; \ + uint8x8_t __s2_907 = __p2_907; \ + int32x2_t __ret_907; \ +uint8x8_t __reint_907 = __s2_907; \ + __ret_907 = vusdot_s32(__s0_907, (uint8x8_t)(splat_lane_s32(*(int32x2_t *) &__reint_907, __p3_907)), __s1_907); \ + __ret_907; \ }) #else -#define vsudot_lane_s32(__p0_816, __p1_816, __p2_816, __p3_816) __extension__ ({ \ - int32x2_t __s0_816 = __p0_816; \ - int8x8_t __s1_816 = __p1_816; \ - uint8x8_t __s2_816 = __p2_816; \ - int32x2_t __rev0_816; __rev0_816 = __builtin_shufflevector(__s0_816, __s0_816, 1, 0); \ - int8x8_t __rev1_816; __rev1_816 = __builtin_shufflevector(__s1_816, __s1_816, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x8_t __rev2_816; __rev2_816 = __builtin_shufflevector(__s2_816, __s2_816, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32x2_t __ret_816; \ -uint8x8_t __reint_816 = __rev2_816; \ - __ret_816 = __noswap_vusdot_s32(__rev0_816, (uint8x8_t)(__noswap_splat_lane_s32(*(int32x2_t *) &__reint_816, __p3_816)), __rev1_816); \ - __ret_816 = __builtin_shufflevector(__ret_816, __ret_816, 1, 0); \ - __ret_816; \ +#define vsudot_lane_s32(__p0_908, __p1_908, __p2_908, __p3_908) __extension__ ({ \ + int32x2_t __s0_908 = __p0_908; \ + int8x8_t __s1_908 = __p1_908; \ + uint8x8_t __s2_908 = __p2_908; \ + int32x2_t __rev0_908; __rev0_908 = __builtin_shufflevector(__s0_908, __s0_908, 1, 0); \ + int8x8_t __rev1_908; __rev1_908 = __builtin_shufflevector(__s1_908, __s1_908, 7, 6, 5, 4, 3, 2, 1, 
0); \ + uint8x8_t __rev2_908; __rev2_908 = __builtin_shufflevector(__s2_908, __s2_908, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x2_t __ret_908; \ +uint8x8_t __reint_908 = __rev2_908; \ + __ret_908 = __noswap_vusdot_s32(__rev0_908, (uint8x8_t)(__noswap_splat_lane_s32(*(int32x2_t *) &__reint_908, __p3_908)), __rev1_908); \ + __ret_908 = __builtin_shufflevector(__ret_908, __ret_908, 1, 0); \ + __ret_908; \ }) #endif @@ -66049,86 +67719,86 @@ __ai int16_t vqrdmlahh_s16(int16_t __p0, int16_t __p1, int16_t __p2) { return __ret; } #ifdef __LITTLE_ENDIAN__ -#define vqrdmlahs_lane_s32(__p0_817, __p1_817, __p2_817, __p3_817) __extension__ ({ \ - int32_t __s0_817 = __p0_817; \ - int32_t __s1_817 = __p1_817; \ - int32x2_t __s2_817 = __p2_817; \ - int32_t __ret_817; \ - __ret_817 = vqadds_s32(__s0_817, vqrdmulhs_s32(__s1_817, vget_lane_s32(__s2_817, __p3_817))); \ - __ret_817; \ +#define vqrdmlahs_lane_s32(__p0_909, __p1_909, __p2_909, __p3_909) __extension__ ({ \ + int32_t __s0_909 = __p0_909; \ + int32_t __s1_909 = __p1_909; \ + int32x2_t __s2_909 = __p2_909; \ + int32_t __ret_909; \ + __ret_909 = vqadds_s32(__s0_909, vqrdmulhs_s32(__s1_909, vget_lane_s32(__s2_909, __p3_909))); \ + __ret_909; \ }) #else -#define vqrdmlahs_lane_s32(__p0_818, __p1_818, __p2_818, __p3_818) __extension__ ({ \ - int32_t __s0_818 = __p0_818; \ - int32_t __s1_818 = __p1_818; \ - int32x2_t __s2_818 = __p2_818; \ - int32x2_t __rev2_818; __rev2_818 = __builtin_shufflevector(__s2_818, __s2_818, 1, 0); \ - int32_t __ret_818; \ - __ret_818 = vqadds_s32(__s0_818, vqrdmulhs_s32(__s1_818, __noswap_vget_lane_s32(__rev2_818, __p3_818))); \ - __ret_818; \ +#define vqrdmlahs_lane_s32(__p0_910, __p1_910, __p2_910, __p3_910) __extension__ ({ \ + int32_t __s0_910 = __p0_910; \ + int32_t __s1_910 = __p1_910; \ + int32x2_t __s2_910 = __p2_910; \ + int32x2_t __rev2_910; __rev2_910 = __builtin_shufflevector(__s2_910, __s2_910, 1, 0); \ + int32_t __ret_910; \ + __ret_910 = vqadds_s32(__s0_910, vqrdmulhs_s32(__s1_910, __noswap_vget_lane_s32(__rev2_910, __p3_910))); \ + __ret_910; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlahh_lane_s16(__p0_819, __p1_819, __p2_819, __p3_819) __extension__ ({ \ - int16_t __s0_819 = __p0_819; \ - int16_t __s1_819 = __p1_819; \ - int16x4_t __s2_819 = __p2_819; \ - int16_t __ret_819; \ - __ret_819 = vqaddh_s16(__s0_819, vqrdmulhh_s16(__s1_819, vget_lane_s16(__s2_819, __p3_819))); \ - __ret_819; \ +#define vqrdmlahh_lane_s16(__p0_911, __p1_911, __p2_911, __p3_911) __extension__ ({ \ + int16_t __s0_911 = __p0_911; \ + int16_t __s1_911 = __p1_911; \ + int16x4_t __s2_911 = __p2_911; \ + int16_t __ret_911; \ + __ret_911 = vqaddh_s16(__s0_911, vqrdmulhh_s16(__s1_911, vget_lane_s16(__s2_911, __p3_911))); \ + __ret_911; \ }) #else -#define vqrdmlahh_lane_s16(__p0_820, __p1_820, __p2_820, __p3_820) __extension__ ({ \ - int16_t __s0_820 = __p0_820; \ - int16_t __s1_820 = __p1_820; \ - int16x4_t __s2_820 = __p2_820; \ - int16x4_t __rev2_820; __rev2_820 = __builtin_shufflevector(__s2_820, __s2_820, 3, 2, 1, 0); \ - int16_t __ret_820; \ - __ret_820 = vqaddh_s16(__s0_820, vqrdmulhh_s16(__s1_820, __noswap_vget_lane_s16(__rev2_820, __p3_820))); \ - __ret_820; \ +#define vqrdmlahh_lane_s16(__p0_912, __p1_912, __p2_912, __p3_912) __extension__ ({ \ + int16_t __s0_912 = __p0_912; \ + int16_t __s1_912 = __p1_912; \ + int16x4_t __s2_912 = __p2_912; \ + int16x4_t __rev2_912; __rev2_912 = __builtin_shufflevector(__s2_912, __s2_912, 3, 2, 1, 0); \ + int16_t __ret_912; \ + __ret_912 = vqaddh_s16(__s0_912, vqrdmulhh_s16(__s1_912, 
__noswap_vget_lane_s16(__rev2_912, __p3_912))); \ + __ret_912; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlahs_laneq_s32(__p0_821, __p1_821, __p2_821, __p3_821) __extension__ ({ \ - int32_t __s0_821 = __p0_821; \ - int32_t __s1_821 = __p1_821; \ - int32x4_t __s2_821 = __p2_821; \ - int32_t __ret_821; \ - __ret_821 = vqadds_s32(__s0_821, vqrdmulhs_s32(__s1_821, vgetq_lane_s32(__s2_821, __p3_821))); \ - __ret_821; \ +#define vqrdmlahs_laneq_s32(__p0_913, __p1_913, __p2_913, __p3_913) __extension__ ({ \ + int32_t __s0_913 = __p0_913; \ + int32_t __s1_913 = __p1_913; \ + int32x4_t __s2_913 = __p2_913; \ + int32_t __ret_913; \ + __ret_913 = vqadds_s32(__s0_913, vqrdmulhs_s32(__s1_913, vgetq_lane_s32(__s2_913, __p3_913))); \ + __ret_913; \ }) #else -#define vqrdmlahs_laneq_s32(__p0_822, __p1_822, __p2_822, __p3_822) __extension__ ({ \ - int32_t __s0_822 = __p0_822; \ - int32_t __s1_822 = __p1_822; \ - int32x4_t __s2_822 = __p2_822; \ - int32x4_t __rev2_822; __rev2_822 = __builtin_shufflevector(__s2_822, __s2_822, 3, 2, 1, 0); \ - int32_t __ret_822; \ - __ret_822 = vqadds_s32(__s0_822, vqrdmulhs_s32(__s1_822, __noswap_vgetq_lane_s32(__rev2_822, __p3_822))); \ - __ret_822; \ +#define vqrdmlahs_laneq_s32(__p0_914, __p1_914, __p2_914, __p3_914) __extension__ ({ \ + int32_t __s0_914 = __p0_914; \ + int32_t __s1_914 = __p1_914; \ + int32x4_t __s2_914 = __p2_914; \ + int32x4_t __rev2_914; __rev2_914 = __builtin_shufflevector(__s2_914, __s2_914, 3, 2, 1, 0); \ + int32_t __ret_914; \ + __ret_914 = vqadds_s32(__s0_914, vqrdmulhs_s32(__s1_914, __noswap_vgetq_lane_s32(__rev2_914, __p3_914))); \ + __ret_914; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlahh_laneq_s16(__p0_823, __p1_823, __p2_823, __p3_823) __extension__ ({ \ - int16_t __s0_823 = __p0_823; \ - int16_t __s1_823 = __p1_823; \ - int16x8_t __s2_823 = __p2_823; \ - int16_t __ret_823; \ - __ret_823 = vqaddh_s16(__s0_823, vqrdmulhh_s16(__s1_823, vgetq_lane_s16(__s2_823, __p3_823))); \ - __ret_823; \ +#define vqrdmlahh_laneq_s16(__p0_915, __p1_915, __p2_915, __p3_915) __extension__ ({ \ + int16_t __s0_915 = __p0_915; \ + int16_t __s1_915 = __p1_915; \ + int16x8_t __s2_915 = __p2_915; \ + int16_t __ret_915; \ + __ret_915 = vqaddh_s16(__s0_915, vqrdmulhh_s16(__s1_915, vgetq_lane_s16(__s2_915, __p3_915))); \ + __ret_915; \ }) #else -#define vqrdmlahh_laneq_s16(__p0_824, __p1_824, __p2_824, __p3_824) __extension__ ({ \ - int16_t __s0_824 = __p0_824; \ - int16_t __s1_824 = __p1_824; \ - int16x8_t __s2_824 = __p2_824; \ - int16x8_t __rev2_824; __rev2_824 = __builtin_shufflevector(__s2_824, __s2_824, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16_t __ret_824; \ - __ret_824 = vqaddh_s16(__s0_824, vqrdmulhh_s16(__s1_824, __noswap_vgetq_lane_s16(__rev2_824, __p3_824))); \ - __ret_824; \ +#define vqrdmlahh_laneq_s16(__p0_916, __p1_916, __p2_916, __p3_916) __extension__ ({ \ + int16_t __s0_916 = __p0_916; \ + int16_t __s1_916 = __p1_916; \ + int16x8_t __s2_916 = __p2_916; \ + int16x8_t __rev2_916; __rev2_916 = __builtin_shufflevector(__s2_916, __s2_916, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16_t __ret_916; \ + __ret_916 = vqaddh_s16(__s0_916, vqrdmulhh_s16(__s1_916, __noswap_vgetq_lane_s16(__rev2_916, __p3_916))); \ + __ret_916; \ }) #endif @@ -66143,86 +67813,86 @@ __ai int16_t vqrdmlshh_s16(int16_t __p0, int16_t __p1, int16_t __p2) { return __ret; } #ifdef __LITTLE_ENDIAN__ -#define vqrdmlshs_lane_s32(__p0_825, __p1_825, __p2_825, __p3_825) __extension__ ({ \ - int32_t __s0_825 = __p0_825; \ - int32_t __s1_825 = __p1_825; \ - int32x2_t __s2_825 = __p2_825; \ - 
int32_t __ret_825; \ - __ret_825 = vqsubs_s32(__s0_825, vqrdmulhs_s32(__s1_825, vget_lane_s32(__s2_825, __p3_825))); \ - __ret_825; \ +#define vqrdmlshs_lane_s32(__p0_917, __p1_917, __p2_917, __p3_917) __extension__ ({ \ + int32_t __s0_917 = __p0_917; \ + int32_t __s1_917 = __p1_917; \ + int32x2_t __s2_917 = __p2_917; \ + int32_t __ret_917; \ + __ret_917 = vqsubs_s32(__s0_917, vqrdmulhs_s32(__s1_917, vget_lane_s32(__s2_917, __p3_917))); \ + __ret_917; \ }) #else -#define vqrdmlshs_lane_s32(__p0_826, __p1_826, __p2_826, __p3_826) __extension__ ({ \ - int32_t __s0_826 = __p0_826; \ - int32_t __s1_826 = __p1_826; \ - int32x2_t __s2_826 = __p2_826; \ - int32x2_t __rev2_826; __rev2_826 = __builtin_shufflevector(__s2_826, __s2_826, 1, 0); \ - int32_t __ret_826; \ - __ret_826 = vqsubs_s32(__s0_826, vqrdmulhs_s32(__s1_826, __noswap_vget_lane_s32(__rev2_826, __p3_826))); \ - __ret_826; \ +#define vqrdmlshs_lane_s32(__p0_918, __p1_918, __p2_918, __p3_918) __extension__ ({ \ + int32_t __s0_918 = __p0_918; \ + int32_t __s1_918 = __p1_918; \ + int32x2_t __s2_918 = __p2_918; \ + int32x2_t __rev2_918; __rev2_918 = __builtin_shufflevector(__s2_918, __s2_918, 1, 0); \ + int32_t __ret_918; \ + __ret_918 = vqsubs_s32(__s0_918, vqrdmulhs_s32(__s1_918, __noswap_vget_lane_s32(__rev2_918, __p3_918))); \ + __ret_918; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlshh_lane_s16(__p0_827, __p1_827, __p2_827, __p3_827) __extension__ ({ \ - int16_t __s0_827 = __p0_827; \ - int16_t __s1_827 = __p1_827; \ - int16x4_t __s2_827 = __p2_827; \ - int16_t __ret_827; \ - __ret_827 = vqsubh_s16(__s0_827, vqrdmulhh_s16(__s1_827, vget_lane_s16(__s2_827, __p3_827))); \ - __ret_827; \ +#define vqrdmlshh_lane_s16(__p0_919, __p1_919, __p2_919, __p3_919) __extension__ ({ \ + int16_t __s0_919 = __p0_919; \ + int16_t __s1_919 = __p1_919; \ + int16x4_t __s2_919 = __p2_919; \ + int16_t __ret_919; \ + __ret_919 = vqsubh_s16(__s0_919, vqrdmulhh_s16(__s1_919, vget_lane_s16(__s2_919, __p3_919))); \ + __ret_919; \ }) #else -#define vqrdmlshh_lane_s16(__p0_828, __p1_828, __p2_828, __p3_828) __extension__ ({ \ - int16_t __s0_828 = __p0_828; \ - int16_t __s1_828 = __p1_828; \ - int16x4_t __s2_828 = __p2_828; \ - int16x4_t __rev2_828; __rev2_828 = __builtin_shufflevector(__s2_828, __s2_828, 3, 2, 1, 0); \ - int16_t __ret_828; \ - __ret_828 = vqsubh_s16(__s0_828, vqrdmulhh_s16(__s1_828, __noswap_vget_lane_s16(__rev2_828, __p3_828))); \ - __ret_828; \ +#define vqrdmlshh_lane_s16(__p0_920, __p1_920, __p2_920, __p3_920) __extension__ ({ \ + int16_t __s0_920 = __p0_920; \ + int16_t __s1_920 = __p1_920; \ + int16x4_t __s2_920 = __p2_920; \ + int16x4_t __rev2_920; __rev2_920 = __builtin_shufflevector(__s2_920, __s2_920, 3, 2, 1, 0); \ + int16_t __ret_920; \ + __ret_920 = vqsubh_s16(__s0_920, vqrdmulhh_s16(__s1_920, __noswap_vget_lane_s16(__rev2_920, __p3_920))); \ + __ret_920; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlshs_laneq_s32(__p0_829, __p1_829, __p2_829, __p3_829) __extension__ ({ \ - int32_t __s0_829 = __p0_829; \ - int32_t __s1_829 = __p1_829; \ - int32x4_t __s2_829 = __p2_829; \ - int32_t __ret_829; \ - __ret_829 = vqsubs_s32(__s0_829, vqrdmulhs_s32(__s1_829, vgetq_lane_s32(__s2_829, __p3_829))); \ - __ret_829; \ +#define vqrdmlshs_laneq_s32(__p0_921, __p1_921, __p2_921, __p3_921) __extension__ ({ \ + int32_t __s0_921 = __p0_921; \ + int32_t __s1_921 = __p1_921; \ + int32x4_t __s2_921 = __p2_921; \ + int32_t __ret_921; \ + __ret_921 = vqsubs_s32(__s0_921, vqrdmulhs_s32(__s1_921, vgetq_lane_s32(__s2_921, __p3_921))); \ + 
__ret_921; \ }) #else -#define vqrdmlshs_laneq_s32(__p0_830, __p1_830, __p2_830, __p3_830) __extension__ ({ \ - int32_t __s0_830 = __p0_830; \ - int32_t __s1_830 = __p1_830; \ - int32x4_t __s2_830 = __p2_830; \ - int32x4_t __rev2_830; __rev2_830 = __builtin_shufflevector(__s2_830, __s2_830, 3, 2, 1, 0); \ - int32_t __ret_830; \ - __ret_830 = vqsubs_s32(__s0_830, vqrdmulhs_s32(__s1_830, __noswap_vgetq_lane_s32(__rev2_830, __p3_830))); \ - __ret_830; \ +#define vqrdmlshs_laneq_s32(__p0_922, __p1_922, __p2_922, __p3_922) __extension__ ({ \ + int32_t __s0_922 = __p0_922; \ + int32_t __s1_922 = __p1_922; \ + int32x4_t __s2_922 = __p2_922; \ + int32x4_t __rev2_922; __rev2_922 = __builtin_shufflevector(__s2_922, __s2_922, 3, 2, 1, 0); \ + int32_t __ret_922; \ + __ret_922 = vqsubs_s32(__s0_922, vqrdmulhs_s32(__s1_922, __noswap_vgetq_lane_s32(__rev2_922, __p3_922))); \ + __ret_922; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlshh_laneq_s16(__p0_831, __p1_831, __p2_831, __p3_831) __extension__ ({ \ - int16_t __s0_831 = __p0_831; \ - int16_t __s1_831 = __p1_831; \ - int16x8_t __s2_831 = __p2_831; \ - int16_t __ret_831; \ - __ret_831 = vqsubh_s16(__s0_831, vqrdmulhh_s16(__s1_831, vgetq_lane_s16(__s2_831, __p3_831))); \ - __ret_831; \ +#define vqrdmlshh_laneq_s16(__p0_923, __p1_923, __p2_923, __p3_923) __extension__ ({ \ + int16_t __s0_923 = __p0_923; \ + int16_t __s1_923 = __p1_923; \ + int16x8_t __s2_923 = __p2_923; \ + int16_t __ret_923; \ + __ret_923 = vqsubh_s16(__s0_923, vqrdmulhh_s16(__s1_923, vgetq_lane_s16(__s2_923, __p3_923))); \ + __ret_923; \ }) #else -#define vqrdmlshh_laneq_s16(__p0_832, __p1_832, __p2_832, __p3_832) __extension__ ({ \ - int16_t __s0_832 = __p0_832; \ - int16_t __s1_832 = __p1_832; \ - int16x8_t __s2_832 = __p2_832; \ - int16x8_t __rev2_832; __rev2_832 = __builtin_shufflevector(__s2_832, __s2_832, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16_t __ret_832; \ - __ret_832 = vqsubh_s16(__s0_832, vqrdmulhh_s16(__s1_832, __noswap_vgetq_lane_s16(__rev2_832, __p3_832))); \ - __ret_832; \ +#define vqrdmlshh_laneq_s16(__p0_924, __p1_924, __p2_924, __p3_924) __extension__ ({ \ + int16_t __s0_924 = __p0_924; \ + int16_t __s1_924 = __p1_924; \ + int16x8_t __s2_924 = __p2_924; \ + int16x8_t __rev2_924; __rev2_924 = __builtin_shufflevector(__s2_924, __s2_924, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16_t __ret_924; \ + __ret_924 = vqsubh_s16(__s0_924, vqrdmulhh_s16(__s1_924, __noswap_vgetq_lane_s16(__rev2_924, __p3_924))); \ + __ret_924; \ }) #endif @@ -66535,136 +68205,136 @@ __ai int32x4_t vaddw_high_s16(int32x4_t __p0, int16x8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_p64(__p0_833, __p1_833, __p2_833, __p3_833) __extension__ ({ \ - poly64x2_t __s0_833 = __p0_833; \ - poly64x1_t __s2_833 = __p2_833; \ - poly64x2_t __ret_833; \ - __ret_833 = vsetq_lane_p64(vget_lane_p64(__s2_833, __p3_833), __s0_833, __p1_833); \ - __ret_833; \ +#define vcopyq_lane_p64(__p0_925, __p1_925, __p2_925, __p3_925) __extension__ ({ \ + poly64x2_t __s0_925 = __p0_925; \ + poly64x1_t __s2_925 = __p2_925; \ + poly64x2_t __ret_925; \ + __ret_925 = vsetq_lane_p64(vget_lane_p64(__s2_925, __p3_925), __s0_925, __p1_925); \ + __ret_925; \ }) #else -#define vcopyq_lane_p64(__p0_834, __p1_834, __p2_834, __p3_834) __extension__ ({ \ - poly64x2_t __s0_834 = __p0_834; \ - poly64x1_t __s2_834 = __p2_834; \ - poly64x2_t __rev0_834; __rev0_834 = __builtin_shufflevector(__s0_834, __s0_834, 1, 0); \ - poly64x2_t __ret_834; \ - __ret_834 = __noswap_vsetq_lane_p64(vget_lane_p64(__s2_834, __p3_834), __rev0_834, __p1_834); 
\ - __ret_834 = __builtin_shufflevector(__ret_834, __ret_834, 1, 0); \ - __ret_834; \ +#define vcopyq_lane_p64(__p0_926, __p1_926, __p2_926, __p3_926) __extension__ ({ \ + poly64x2_t __s0_926 = __p0_926; \ + poly64x1_t __s2_926 = __p2_926; \ + poly64x2_t __rev0_926; __rev0_926 = __builtin_shufflevector(__s0_926, __s0_926, 1, 0); \ + poly64x2_t __ret_926; \ + __ret_926 = __noswap_vsetq_lane_p64(vget_lane_p64(__s2_926, __p3_926), __rev0_926, __p1_926); \ + __ret_926 = __builtin_shufflevector(__ret_926, __ret_926, 1, 0); \ + __ret_926; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_f64(__p0_835, __p1_835, __p2_835, __p3_835) __extension__ ({ \ - float64x2_t __s0_835 = __p0_835; \ - float64x1_t __s2_835 = __p2_835; \ - float64x2_t __ret_835; \ - __ret_835 = vsetq_lane_f64(vget_lane_f64(__s2_835, __p3_835), __s0_835, __p1_835); \ - __ret_835; \ +#define vcopyq_lane_f64(__p0_927, __p1_927, __p2_927, __p3_927) __extension__ ({ \ + float64x2_t __s0_927 = __p0_927; \ + float64x1_t __s2_927 = __p2_927; \ + float64x2_t __ret_927; \ + __ret_927 = vsetq_lane_f64(vget_lane_f64(__s2_927, __p3_927), __s0_927, __p1_927); \ + __ret_927; \ }) #else -#define vcopyq_lane_f64(__p0_836, __p1_836, __p2_836, __p3_836) __extension__ ({ \ - float64x2_t __s0_836 = __p0_836; \ - float64x1_t __s2_836 = __p2_836; \ - float64x2_t __rev0_836; __rev0_836 = __builtin_shufflevector(__s0_836, __s0_836, 1, 0); \ - float64x2_t __ret_836; \ - __ret_836 = __noswap_vsetq_lane_f64(vget_lane_f64(__s2_836, __p3_836), __rev0_836, __p1_836); \ - __ret_836 = __builtin_shufflevector(__ret_836, __ret_836, 1, 0); \ - __ret_836; \ +#define vcopyq_lane_f64(__p0_928, __p1_928, __p2_928, __p3_928) __extension__ ({ \ + float64x2_t __s0_928 = __p0_928; \ + float64x1_t __s2_928 = __p2_928; \ + float64x2_t __rev0_928; __rev0_928 = __builtin_shufflevector(__s0_928, __s0_928, 1, 0); \ + float64x2_t __ret_928; \ + __ret_928 = __noswap_vsetq_lane_f64(vget_lane_f64(__s2_928, __p3_928), __rev0_928, __p1_928); \ + __ret_928 = __builtin_shufflevector(__ret_928, __ret_928, 1, 0); \ + __ret_928; \ }) #endif -#define vcopy_lane_p64(__p0_837, __p1_837, __p2_837, __p3_837) __extension__ ({ \ - poly64x1_t __s0_837 = __p0_837; \ - poly64x1_t __s2_837 = __p2_837; \ - poly64x1_t __ret_837; \ - __ret_837 = vset_lane_p64(vget_lane_p64(__s2_837, __p3_837), __s0_837, __p1_837); \ - __ret_837; \ +#define vcopy_lane_p64(__p0_929, __p1_929, __p2_929, __p3_929) __extension__ ({ \ + poly64x1_t __s0_929 = __p0_929; \ + poly64x1_t __s2_929 = __p2_929; \ + poly64x1_t __ret_929; \ + __ret_929 = vset_lane_p64(vget_lane_p64(__s2_929, __p3_929), __s0_929, __p1_929); \ + __ret_929; \ }) -#define vcopy_lane_f64(__p0_838, __p1_838, __p2_838, __p3_838) __extension__ ({ \ - float64x1_t __s0_838 = __p0_838; \ - float64x1_t __s2_838 = __p2_838; \ - float64x1_t __ret_838; \ - __ret_838 = vset_lane_f64(vget_lane_f64(__s2_838, __p3_838), __s0_838, __p1_838); \ - __ret_838; \ +#define vcopy_lane_f64(__p0_930, __p1_930, __p2_930, __p3_930) __extension__ ({ \ + float64x1_t __s0_930 = __p0_930; \ + float64x1_t __s2_930 = __p2_930; \ + float64x1_t __ret_930; \ + __ret_930 = vset_lane_f64(vget_lane_f64(__s2_930, __p3_930), __s0_930, __p1_930); \ + __ret_930; \ }) #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_p64(__p0_839, __p1_839, __p2_839, __p3_839) __extension__ ({ \ - poly64x2_t __s0_839 = __p0_839; \ - poly64x2_t __s2_839 = __p2_839; \ - poly64x2_t __ret_839; \ - __ret_839 = vsetq_lane_p64(vgetq_lane_p64(__s2_839, __p3_839), __s0_839, __p1_839); \ - __ret_839; \ +#define 
vcopyq_laneq_p64(__p0_931, __p1_931, __p2_931, __p3_931) __extension__ ({ \ + poly64x2_t __s0_931 = __p0_931; \ + poly64x2_t __s2_931 = __p2_931; \ + poly64x2_t __ret_931; \ + __ret_931 = vsetq_lane_p64(vgetq_lane_p64(__s2_931, __p3_931), __s0_931, __p1_931); \ + __ret_931; \ }) #else -#define vcopyq_laneq_p64(__p0_840, __p1_840, __p2_840, __p3_840) __extension__ ({ \ - poly64x2_t __s0_840 = __p0_840; \ - poly64x2_t __s2_840 = __p2_840; \ - poly64x2_t __rev0_840; __rev0_840 = __builtin_shufflevector(__s0_840, __s0_840, 1, 0); \ - poly64x2_t __rev2_840; __rev2_840 = __builtin_shufflevector(__s2_840, __s2_840, 1, 0); \ - poly64x2_t __ret_840; \ - __ret_840 = __noswap_vsetq_lane_p64(__noswap_vgetq_lane_p64(__rev2_840, __p3_840), __rev0_840, __p1_840); \ - __ret_840 = __builtin_shufflevector(__ret_840, __ret_840, 1, 0); \ - __ret_840; \ +#define vcopyq_laneq_p64(__p0_932, __p1_932, __p2_932, __p3_932) __extension__ ({ \ + poly64x2_t __s0_932 = __p0_932; \ + poly64x2_t __s2_932 = __p2_932; \ + poly64x2_t __rev0_932; __rev0_932 = __builtin_shufflevector(__s0_932, __s0_932, 1, 0); \ + poly64x2_t __rev2_932; __rev2_932 = __builtin_shufflevector(__s2_932, __s2_932, 1, 0); \ + poly64x2_t __ret_932; \ + __ret_932 = __noswap_vsetq_lane_p64(__noswap_vgetq_lane_p64(__rev2_932, __p3_932), __rev0_932, __p1_932); \ + __ret_932 = __builtin_shufflevector(__ret_932, __ret_932, 1, 0); \ + __ret_932; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_f64(__p0_841, __p1_841, __p2_841, __p3_841) __extension__ ({ \ - float64x2_t __s0_841 = __p0_841; \ - float64x2_t __s2_841 = __p2_841; \ - float64x2_t __ret_841; \ - __ret_841 = vsetq_lane_f64(vgetq_lane_f64(__s2_841, __p3_841), __s0_841, __p1_841); \ - __ret_841; \ +#define vcopyq_laneq_f64(__p0_933, __p1_933, __p2_933, __p3_933) __extension__ ({ \ + float64x2_t __s0_933 = __p0_933; \ + float64x2_t __s2_933 = __p2_933; \ + float64x2_t __ret_933; \ + __ret_933 = vsetq_lane_f64(vgetq_lane_f64(__s2_933, __p3_933), __s0_933, __p1_933); \ + __ret_933; \ }) #else -#define vcopyq_laneq_f64(__p0_842, __p1_842, __p2_842, __p3_842) __extension__ ({ \ - float64x2_t __s0_842 = __p0_842; \ - float64x2_t __s2_842 = __p2_842; \ - float64x2_t __rev0_842; __rev0_842 = __builtin_shufflevector(__s0_842, __s0_842, 1, 0); \ - float64x2_t __rev2_842; __rev2_842 = __builtin_shufflevector(__s2_842, __s2_842, 1, 0); \ - float64x2_t __ret_842; \ - __ret_842 = __noswap_vsetq_lane_f64(__noswap_vgetq_lane_f64(__rev2_842, __p3_842), __rev0_842, __p1_842); \ - __ret_842 = __builtin_shufflevector(__ret_842, __ret_842, 1, 0); \ - __ret_842; \ +#define vcopyq_laneq_f64(__p0_934, __p1_934, __p2_934, __p3_934) __extension__ ({ \ + float64x2_t __s0_934 = __p0_934; \ + float64x2_t __s2_934 = __p2_934; \ + float64x2_t __rev0_934; __rev0_934 = __builtin_shufflevector(__s0_934, __s0_934, 1, 0); \ + float64x2_t __rev2_934; __rev2_934 = __builtin_shufflevector(__s2_934, __s2_934, 1, 0); \ + float64x2_t __ret_934; \ + __ret_934 = __noswap_vsetq_lane_f64(__noswap_vgetq_lane_f64(__rev2_934, __p3_934), __rev0_934, __p1_934); \ + __ret_934 = __builtin_shufflevector(__ret_934, __ret_934, 1, 0); \ + __ret_934; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_p64(__p0_843, __p1_843, __p2_843, __p3_843) __extension__ ({ \ - poly64x1_t __s0_843 = __p0_843; \ - poly64x2_t __s2_843 = __p2_843; \ - poly64x1_t __ret_843; \ - __ret_843 = vset_lane_p64(vgetq_lane_p64(__s2_843, __p3_843), __s0_843, __p1_843); \ - __ret_843; \ +#define vcopy_laneq_p64(__p0_935, __p1_935, __p2_935, __p3_935) __extension__ ({ 
\ + poly64x1_t __s0_935 = __p0_935; \ + poly64x2_t __s2_935 = __p2_935; \ + poly64x1_t __ret_935; \ + __ret_935 = vset_lane_p64(vgetq_lane_p64(__s2_935, __p3_935), __s0_935, __p1_935); \ + __ret_935; \ }) #else -#define vcopy_laneq_p64(__p0_844, __p1_844, __p2_844, __p3_844) __extension__ ({ \ - poly64x1_t __s0_844 = __p0_844; \ - poly64x2_t __s2_844 = __p2_844; \ - poly64x2_t __rev2_844; __rev2_844 = __builtin_shufflevector(__s2_844, __s2_844, 1, 0); \ - poly64x1_t __ret_844; \ - __ret_844 = vset_lane_p64(__noswap_vgetq_lane_p64(__rev2_844, __p3_844), __s0_844, __p1_844); \ - __ret_844; \ +#define vcopy_laneq_p64(__p0_936, __p1_936, __p2_936, __p3_936) __extension__ ({ \ + poly64x1_t __s0_936 = __p0_936; \ + poly64x2_t __s2_936 = __p2_936; \ + poly64x2_t __rev2_936; __rev2_936 = __builtin_shufflevector(__s2_936, __s2_936, 1, 0); \ + poly64x1_t __ret_936; \ + __ret_936 = vset_lane_p64(__noswap_vgetq_lane_p64(__rev2_936, __p3_936), __s0_936, __p1_936); \ + __ret_936; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_f64(__p0_845, __p1_845, __p2_845, __p3_845) __extension__ ({ \ - float64x1_t __s0_845 = __p0_845; \ - float64x2_t __s2_845 = __p2_845; \ - float64x1_t __ret_845; \ - __ret_845 = vset_lane_f64(vgetq_lane_f64(__s2_845, __p3_845), __s0_845, __p1_845); \ - __ret_845; \ +#define vcopy_laneq_f64(__p0_937, __p1_937, __p2_937, __p3_937) __extension__ ({ \ + float64x1_t __s0_937 = __p0_937; \ + float64x2_t __s2_937 = __p2_937; \ + float64x1_t __ret_937; \ + __ret_937 = vset_lane_f64(vgetq_lane_f64(__s2_937, __p3_937), __s0_937, __p1_937); \ + __ret_937; \ }) #else -#define vcopy_laneq_f64(__p0_846, __p1_846, __p2_846, __p3_846) __extension__ ({ \ - float64x1_t __s0_846 = __p0_846; \ - float64x2_t __s2_846 = __p2_846; \ - float64x2_t __rev2_846; __rev2_846 = __builtin_shufflevector(__s2_846, __s2_846, 1, 0); \ - float64x1_t __ret_846; \ - __ret_846 = vset_lane_f64(__noswap_vgetq_lane_f64(__rev2_846, __p3_846), __s0_846, __p1_846); \ - __ret_846; \ +#define vcopy_laneq_f64(__p0_938, __p1_938, __p2_938, __p3_938) __extension__ ({ \ + float64x1_t __s0_938 = __p0_938; \ + float64x2_t __s2_938 = __p2_938; \ + float64x2_t __rev2_938; __rev2_938 = __builtin_shufflevector(__s2_938, __s2_938, 1, 0); \ + float64x1_t __ret_938; \ + __ret_938 = vset_lane_f64(__noswap_vgetq_lane_f64(__rev2_938, __p3_938), __s0_938, __p1_938); \ + __ret_938; \ }) #endif @@ -67020,38 +68690,38 @@ __ai int32x4_t vmlsl_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) { } #endif -#define vmulx_lane_f64(__p0_847, __p1_847, __p2_847) __extension__ ({ \ - float64x1_t __s0_847 = __p0_847; \ - float64x1_t __s1_847 = __p1_847; \ - float64x1_t __ret_847; \ - float64_t __x_847 = vget_lane_f64(__s0_847, 0); \ - float64_t __y_847 = vget_lane_f64(__s1_847, __p2_847); \ - float64_t __z_847 = vmulxd_f64(__x_847, __y_847); \ - __ret_847 = vset_lane_f64(__z_847, __s0_847, __p2_847); \ - __ret_847; \ +#define vmulx_lane_f64(__p0_939, __p1_939, __p2_939) __extension__ ({ \ + float64x1_t __s0_939 = __p0_939; \ + float64x1_t __s1_939 = __p1_939; \ + float64x1_t __ret_939; \ + float64_t __x_939 = vget_lane_f64(__s0_939, 0); \ + float64_t __y_939 = vget_lane_f64(__s1_939, __p2_939); \ + float64_t __z_939 = vmulxd_f64(__x_939, __y_939); \ + __ret_939 = vset_lane_f64(__z_939, __s0_939, __p2_939); \ + __ret_939; \ }) #ifdef __LITTLE_ENDIAN__ -#define vmulx_laneq_f64(__p0_848, __p1_848, __p2_848) __extension__ ({ \ - float64x1_t __s0_848 = __p0_848; \ - float64x2_t __s1_848 = __p1_848; \ - float64x1_t __ret_848; \ - float64_t 
__x_848 = vget_lane_f64(__s0_848, 0); \ - float64_t __y_848 = vgetq_lane_f64(__s1_848, __p2_848); \ - float64_t __z_848 = vmulxd_f64(__x_848, __y_848); \ - __ret_848 = vset_lane_f64(__z_848, __s0_848, 0); \ - __ret_848; \ +#define vmulx_laneq_f64(__p0_940, __p1_940, __p2_940) __extension__ ({ \ + float64x1_t __s0_940 = __p0_940; \ + float64x2_t __s1_940 = __p1_940; \ + float64x1_t __ret_940; \ + float64_t __x_940 = vget_lane_f64(__s0_940, 0); \ + float64_t __y_940 = vgetq_lane_f64(__s1_940, __p2_940); \ + float64_t __z_940 = vmulxd_f64(__x_940, __y_940); \ + __ret_940 = vset_lane_f64(__z_940, __s0_940, 0); \ + __ret_940; \ }) #else -#define vmulx_laneq_f64(__p0_849, __p1_849, __p2_849) __extension__ ({ \ - float64x1_t __s0_849 = __p0_849; \ - float64x2_t __s1_849 = __p1_849; \ - float64x2_t __rev1_849; __rev1_849 = __builtin_shufflevector(__s1_849, __s1_849, 1, 0); \ - float64x1_t __ret_849; \ - float64_t __x_849 = vget_lane_f64(__s0_849, 0); \ - float64_t __y_849 = __noswap_vgetq_lane_f64(__rev1_849, __p2_849); \ - float64_t __z_849 = vmulxd_f64(__x_849, __y_849); \ - __ret_849 = vset_lane_f64(__z_849, __s0_849, 0); \ - __ret_849; \ +#define vmulx_laneq_f64(__p0_941, __p1_941, __p2_941) __extension__ ({ \ + float64x1_t __s0_941 = __p0_941; \ + float64x2_t __s1_941 = __p1_941; \ + float64x2_t __rev1_941; __rev1_941 = __builtin_shufflevector(__s1_941, __s1_941, 1, 0); \ + float64x1_t __ret_941; \ + float64_t __x_941 = vget_lane_f64(__s0_941, 0); \ + float64_t __y_941 = __noswap_vgetq_lane_f64(__rev1_941, __p2_941); \ + float64_t __z_941 = vmulxd_f64(__x_941, __y_941); \ + __ret_941 = vset_lane_f64(__z_941, __s0_941, 0); \ + __ret_941; \ }) #endif diff --git a/lib/include/arm_sve.h b/lib/include/arm_sve.h index 1035d41811..8a03f9da58 100644 --- a/lib/include/arm_sve.h +++ b/lib/include/arm_sve.h @@ -94,7 +94,7 @@ typedef __clang_svbfloat16x2_t svbfloat16x2_t; typedef __clang_svbfloat16x3_t svbfloat16x3_t; typedef __clang_svbfloat16x4_t svbfloat16x4_t; #endif -typedef enum +enum svpattern { SV_POW2 = 0, SV_VL1 = 1, @@ -113,9 +113,9 @@ typedef enum SV_MUL4 = 29, SV_MUL3 = 30, SV_ALL = 31 -} sv_pattern; +}; -typedef enum +enum svprfop { SV_PLDL1KEEP = 0, SV_PLDL1STRM = 1, @@ -129,7 +129,7 @@ typedef enum SV_PSTL2STRM = 11, SV_PSTL3KEEP = 12, SV_PSTL3STRM = 13 -} sv_prfop; +}; /* Function attributes */ #define __aio static inline __attribute__((__always_inline__, __nodebug__, __overloadable__)) @@ -10013,69 +10013,69 @@ int16_t svorv(svbool_t, svint16_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpfirst_b))) svbool_t svpfirst(svbool_t, svbool_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u32base))) -void svprfb_gather(svbool_t, svuint32_t, sv_prfop); +void svprfb_gather(svbool_t, svuint32_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u64base))) -void svprfb_gather(svbool_t, svuint64_t, sv_prfop); +void svprfb_gather(svbool_t, svuint64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u32base_offset))) -void svprfb_gather_offset(svbool_t, svuint32_t, int64_t, sv_prfop); +void svprfb_gather_offset(svbool_t, svuint32_t, int64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u64base_offset))) -void svprfb_gather_offset(svbool_t, svuint64_t, int64_t, sv_prfop); +void svprfb_gather_offset(svbool_t, svuint64_t, int64_t, enum svprfop); __aio 
__attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_s32offset))) -void svprfb_gather_offset(svbool_t, void const *, svint32_t, sv_prfop); +void svprfb_gather_offset(svbool_t, void const *, svint32_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u32offset))) -void svprfb_gather_offset(svbool_t, void const *, svuint32_t, sv_prfop); +void svprfb_gather_offset(svbool_t, void const *, svuint32_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_s64offset))) -void svprfb_gather_offset(svbool_t, void const *, svint64_t, sv_prfop); +void svprfb_gather_offset(svbool_t, void const *, svint64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u64offset))) -void svprfb_gather_offset(svbool_t, void const *, svuint64_t, sv_prfop); +void svprfb_gather_offset(svbool_t, void const *, svuint64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u32base))) -void svprfd_gather(svbool_t, svuint32_t, sv_prfop); +void svprfd_gather(svbool_t, svuint32_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u64base))) -void svprfd_gather(svbool_t, svuint64_t, sv_prfop); +void svprfd_gather(svbool_t, svuint64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u32base_index))) -void svprfd_gather_index(svbool_t, svuint32_t, int64_t, sv_prfop); +void svprfd_gather_index(svbool_t, svuint32_t, int64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u64base_index))) -void svprfd_gather_index(svbool_t, svuint64_t, int64_t, sv_prfop); +void svprfd_gather_index(svbool_t, svuint64_t, int64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_s32index))) -void svprfd_gather_index(svbool_t, void const *, svint32_t, sv_prfop); +void svprfd_gather_index(svbool_t, void const *, svint32_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u32index))) -void svprfd_gather_index(svbool_t, void const *, svuint32_t, sv_prfop); +void svprfd_gather_index(svbool_t, void const *, svuint32_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_s64index))) -void svprfd_gather_index(svbool_t, void const *, svint64_t, sv_prfop); +void svprfd_gather_index(svbool_t, void const *, svint64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u64index))) -void svprfd_gather_index(svbool_t, void const *, svuint64_t, sv_prfop); +void svprfd_gather_index(svbool_t, void const *, svuint64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u32base))) -void svprfh_gather(svbool_t, svuint32_t, sv_prfop); +void svprfh_gather(svbool_t, svuint32_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u64base))) -void svprfh_gather(svbool_t, svuint64_t, sv_prfop); +void svprfh_gather(svbool_t, svuint64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u32base_index))) -void svprfh_gather_index(svbool_t, svuint32_t, int64_t, sv_prfop); +void svprfh_gather_index(svbool_t, svuint32_t, int64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u64base_index))) -void svprfh_gather_index(svbool_t, svuint64_t, int64_t, sv_prfop); +void 
svprfh_gather_index(svbool_t, svuint64_t, int64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_s32index))) -void svprfh_gather_index(svbool_t, void const *, svint32_t, sv_prfop); +void svprfh_gather_index(svbool_t, void const *, svint32_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u32index))) -void svprfh_gather_index(svbool_t, void const *, svuint32_t, sv_prfop); +void svprfh_gather_index(svbool_t, void const *, svuint32_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_s64index))) -void svprfh_gather_index(svbool_t, void const *, svint64_t, sv_prfop); +void svprfh_gather_index(svbool_t, void const *, svint64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u64index))) -void svprfh_gather_index(svbool_t, void const *, svuint64_t, sv_prfop); +void svprfh_gather_index(svbool_t, void const *, svuint64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u32base))) -void svprfw_gather(svbool_t, svuint32_t, sv_prfop); +void svprfw_gather(svbool_t, svuint32_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u64base))) -void svprfw_gather(svbool_t, svuint64_t, sv_prfop); +void svprfw_gather(svbool_t, svuint64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u32base_index))) -void svprfw_gather_index(svbool_t, svuint32_t, int64_t, sv_prfop); +void svprfw_gather_index(svbool_t, svuint32_t, int64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u64base_index))) -void svprfw_gather_index(svbool_t, svuint64_t, int64_t, sv_prfop); +void svprfw_gather_index(svbool_t, svuint64_t, int64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_s32index))) -void svprfw_gather_index(svbool_t, void const *, svint32_t, sv_prfop); +void svprfw_gather_index(svbool_t, void const *, svint32_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u32index))) -void svprfw_gather_index(svbool_t, void const *, svuint32_t, sv_prfop); +void svprfw_gather_index(svbool_t, void const *, svuint32_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_s64index))) -void svprfw_gather_index(svbool_t, void const *, svint64_t, sv_prfop); +void svprfw_gather_index(svbool_t, void const *, svint64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u64index))) -void svprfw_gather_index(svbool_t, void const *, svuint64_t, sv_prfop); +void svprfw_gather_index(svbool_t, void const *, svuint64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s8))) svint8_t svqadd(svint8_t, int8_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s32))) @@ -10117,13 +10117,13 @@ uint32_t svqdecb(uint32_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_n_u64))) uint64_t svqdecb(uint64_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_pat_n_s32))) -int32_t svqdecb_pat(int32_t, sv_pattern, uint64_t); +int32_t svqdecb_pat(int32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_pat_n_s64))) -int64_t svqdecb_pat(int64_t, sv_pattern, uint64_t); +int64_t svqdecb_pat(int64_t, enum svpattern, 
uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_pat_n_u32))) -uint32_t svqdecb_pat(uint32_t, sv_pattern, uint64_t); +uint32_t svqdecb_pat(uint32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_pat_n_u64))) -uint64_t svqdecb_pat(uint64_t, sv_pattern, uint64_t); +uint64_t svqdecb_pat(uint64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_n_s32))) int32_t svqdecd(int32_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_n_s64))) @@ -10137,17 +10137,17 @@ svint64_t svqdecd(svint64_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_u64))) svuint64_t svqdecd(svuint64_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_n_s32))) -int32_t svqdecd_pat(int32_t, sv_pattern, uint64_t); +int32_t svqdecd_pat(int32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_n_s64))) -int64_t svqdecd_pat(int64_t, sv_pattern, uint64_t); +int64_t svqdecd_pat(int64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_n_u32))) -uint32_t svqdecd_pat(uint32_t, sv_pattern, uint64_t); +uint32_t svqdecd_pat(uint32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_n_u64))) -uint64_t svqdecd_pat(uint64_t, sv_pattern, uint64_t); +uint64_t svqdecd_pat(uint64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_s64))) -svint64_t svqdecd_pat(svint64_t, sv_pattern, uint64_t); +svint64_t svqdecd_pat(svint64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_u64))) -svuint64_t svqdecd_pat(svuint64_t, sv_pattern, uint64_t); +svuint64_t svqdecd_pat(svuint64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_n_s32))) int32_t svqdech(int32_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_n_s64))) @@ -10161,17 +10161,17 @@ svint16_t svqdech(svint16_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_u16))) svuint16_t svqdech(svuint16_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_n_s32))) -int32_t svqdech_pat(int32_t, sv_pattern, uint64_t); +int32_t svqdech_pat(int32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_n_s64))) -int64_t svqdech_pat(int64_t, sv_pattern, uint64_t); +int64_t svqdech_pat(int64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_n_u32))) -uint32_t svqdech_pat(uint32_t, sv_pattern, uint64_t); +uint32_t svqdech_pat(uint32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_n_u64))) -uint64_t svqdech_pat(uint64_t, sv_pattern, uint64_t); +uint64_t svqdech_pat(uint64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_s16))) -svint16_t svqdech_pat(svint16_t, sv_pattern, uint64_t); +svint16_t svqdech_pat(svint16_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_u16))) -svuint16_t svqdech_pat(svuint16_t, sv_pattern, uint64_t); +svuint16_t svqdech_pat(svuint16_t, enum svpattern, uint64_t); __aio 
__attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_s32_b8))) int32_t svqdecp_b8(int32_t, svbool_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_s32_b32))) @@ -10229,17 +10229,17 @@ svint32_t svqdecw(svint32_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_u32))) svuint32_t svqdecw(svuint32_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_n_s32))) -int32_t svqdecw_pat(int32_t, sv_pattern, uint64_t); +int32_t svqdecw_pat(int32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_n_s64))) -int64_t svqdecw_pat(int64_t, sv_pattern, uint64_t); +int64_t svqdecw_pat(int64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_n_u32))) -uint32_t svqdecw_pat(uint32_t, sv_pattern, uint64_t); +uint32_t svqdecw_pat(uint32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_n_u64))) -uint64_t svqdecw_pat(uint64_t, sv_pattern, uint64_t); +uint64_t svqdecw_pat(uint64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_s32))) -svint32_t svqdecw_pat(svint32_t, sv_pattern, uint64_t); +svint32_t svqdecw_pat(svint32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_u32))) -svuint32_t svqdecw_pat(svuint32_t, sv_pattern, uint64_t); +svuint32_t svqdecw_pat(svuint32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_n_s32))) int32_t svqincb(int32_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_n_s64))) @@ -10249,13 +10249,13 @@ uint32_t svqincb(uint32_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_n_u64))) uint64_t svqincb(uint64_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_pat_n_s32))) -int32_t svqincb_pat(int32_t, sv_pattern, uint64_t); +int32_t svqincb_pat(int32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_pat_n_s64))) -int64_t svqincb_pat(int64_t, sv_pattern, uint64_t); +int64_t svqincb_pat(int64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_pat_n_u32))) -uint32_t svqincb_pat(uint32_t, sv_pattern, uint64_t); +uint32_t svqincb_pat(uint32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_pat_n_u64))) -uint64_t svqincb_pat(uint64_t, sv_pattern, uint64_t); +uint64_t svqincb_pat(uint64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_n_s32))) int32_t svqincd(int32_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_n_s64))) @@ -10269,17 +10269,17 @@ svint64_t svqincd(svint64_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_u64))) svuint64_t svqincd(svuint64_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_n_s32))) -int32_t svqincd_pat(int32_t, sv_pattern, uint64_t); +int32_t svqincd_pat(int32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_n_s64))) -int64_t svqincd_pat(int64_t, sv_pattern, uint64_t); +int64_t svqincd_pat(int64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_n_u32))) 
-uint32_t svqincd_pat(uint32_t, sv_pattern, uint64_t); +uint32_t svqincd_pat(uint32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_n_u64))) -uint64_t svqincd_pat(uint64_t, sv_pattern, uint64_t); +uint64_t svqincd_pat(uint64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_s64))) -svint64_t svqincd_pat(svint64_t, sv_pattern, uint64_t); +svint64_t svqincd_pat(svint64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_u64))) -svuint64_t svqincd_pat(svuint64_t, sv_pattern, uint64_t); +svuint64_t svqincd_pat(svuint64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_n_s32))) int32_t svqinch(int32_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_n_s64))) @@ -10293,17 +10293,17 @@ svint16_t svqinch(svint16_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_u16))) svuint16_t svqinch(svuint16_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_n_s32))) -int32_t svqinch_pat(int32_t, sv_pattern, uint64_t); +int32_t svqinch_pat(int32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_n_s64))) -int64_t svqinch_pat(int64_t, sv_pattern, uint64_t); +int64_t svqinch_pat(int64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_n_u32))) -uint32_t svqinch_pat(uint32_t, sv_pattern, uint64_t); +uint32_t svqinch_pat(uint32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_n_u64))) -uint64_t svqinch_pat(uint64_t, sv_pattern, uint64_t); +uint64_t svqinch_pat(uint64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_s16))) -svint16_t svqinch_pat(svint16_t, sv_pattern, uint64_t); +svint16_t svqinch_pat(svint16_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_u16))) -svuint16_t svqinch_pat(svuint16_t, sv_pattern, uint64_t); +svuint16_t svqinch_pat(svuint16_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_s32_b8))) int32_t svqincp_b8(int32_t, svbool_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_s32_b32))) @@ -10361,17 +10361,17 @@ svint32_t svqincw(svint32_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_u32))) svuint32_t svqincw(svuint32_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_n_s32))) -int32_t svqincw_pat(int32_t, sv_pattern, uint64_t); +int32_t svqincw_pat(int32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_n_s64))) -int64_t svqincw_pat(int64_t, sv_pattern, uint64_t); +int64_t svqincw_pat(int64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_n_u32))) -uint32_t svqincw_pat(uint32_t, sv_pattern, uint64_t); +uint32_t svqincw_pat(uint32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_n_u64))) -uint64_t svqincw_pat(uint64_t, sv_pattern, uint64_t); +uint64_t svqincw_pat(uint64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_s32))) -svint32_t svqincw_pat(svint32_t, sv_pattern, 
uint64_t); +svint32_t svqincw_pat(svint32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_u32))) -svuint32_t svqincw_pat(svuint32_t, sv_pattern, uint64_t); +svuint32_t svqincw_pat(svuint32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s8))) svint8_t svqsub(svint8_t, int8_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s32))) diff --git a/lib/include/avx512fintrin.h b/lib/include/avx512fintrin.h index fa22ef3fdd..f226382cbb 100644 --- a/lib/include/avx512fintrin.h +++ b/lib/include/avx512fintrin.h @@ -9297,303 +9297,232 @@ _mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A) /* Vector-reduction arithmetic accepts vectors as inputs and produces scalars as * outputs. This class of vector operation forms the basis of many scientific - * computations. In vector-reduction arithmetic, the evaluation off is + * computations. In vector-reduction arithmetic, the evaluation order is * independent of the order of the input elements of V. + * For floating point types, we always assume the elements are reassociable even + * if -fast-math is off. + * Used bisection method. At each step, we partition the vector with previous * step in half, and the operation is performed on its two halves. * This takes log2(n) steps where n is the number of elements in the vector. */ -#define _mm512_mask_reduce_operator(op) \ - __v4du __t1 = (__v4du)_mm512_extracti64x4_epi64(__W, 0); \ - __v4du __t2 = (__v4du)_mm512_extracti64x4_epi64(__W, 1); \ - __m256i __t3 = (__m256i)(__t1 op __t2); \ - __v2du __t4 = (__v2du)_mm256_extracti128_si256(__t3, 0); \ - __v2du __t5 = (__v2du)_mm256_extracti128_si256(__t3, 1); \ - __v2du __t6 = __t4 op __t5; \ - __v2du __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \ - __v2du __t8 = __t6 op __t7; \ - return __t8[0] - static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) { - _mm512_mask_reduce_operator(+); + return __builtin_ia32_reduce_add_q512(__W); } static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) { - _mm512_mask_reduce_operator(*); + return __builtin_ia32_reduce_mul_q512(__W); } static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) { - _mm512_mask_reduce_operator(&); + return __builtin_ia32_reduce_and_q512(__W); } static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i __W) { - _mm512_mask_reduce_operator(|); + return __builtin_ia32_reduce_or_q512(__W); } static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) { __W = _mm512_maskz_mov_epi64(__M, __W); - _mm512_mask_reduce_operator(+); + return __builtin_ia32_reduce_add_q512(__W); } static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) { __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W); - _mm512_mask_reduce_operator(*); + return __builtin_ia32_reduce_mul_q512(__W); } static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) { __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __W); - _mm512_mask_reduce_operator(&); + return __builtin_ia32_reduce_and_q512(__W); } static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) { __W = _mm512_maskz_mov_epi64(__M, __W); - _mm512_mask_reduce_operator(|); + return __builtin_ia32_reduce_or_q512(__W); } -#undef 
_mm512_mask_reduce_operator - -#define _mm512_mask_reduce_operator(op) \ - __m256d __t1 = _mm512_extractf64x4_pd(__W, 0); \ - __m256d __t2 = _mm512_extractf64x4_pd(__W, 1); \ - __m256d __t3 = __t1 op __t2; \ - __m128d __t4 = _mm256_extractf128_pd(__t3, 0); \ - __m128d __t5 = _mm256_extractf128_pd(__t3, 1); \ - __m128d __t6 = __t4 op __t5; \ - __m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \ - __m128d __t8 = __t6 op __t7; \ - return __t8[0] +// -0.0 is used to ignore the start value since it is the neutral value of +// floating point addition. For more information, please refer to +// https://llvm.org/docs/LangRef.html#llvm-vector-reduce-fadd-intrinsic static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) { - _mm512_mask_reduce_operator(+); + return __builtin_ia32_reduce_fadd_pd512(-0.0, __W); } static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_pd(__m512d __W) { - _mm512_mask_reduce_operator(*); + return __builtin_ia32_reduce_fmul_pd512(1.0, __W); } static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) { __W = _mm512_maskz_mov_pd(__M, __W); - _mm512_mask_reduce_operator(+); + return __builtin_ia32_reduce_fadd_pd512(-0.0, __W); } static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) { __W = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __M, __W); - _mm512_mask_reduce_operator(*); + return __builtin_ia32_reduce_fmul_pd512(1.0, __W); } -#undef _mm512_mask_reduce_operator - -#define _mm512_mask_reduce_operator(op) \ - __v8su __t1 = (__v8su)_mm512_extracti64x4_epi64(__W, 0); \ - __v8su __t2 = (__v8su)_mm512_extracti64x4_epi64(__W, 1); \ - __m256i __t3 = (__m256i)(__t1 op __t2); \ - __v4su __t4 = (__v4su)_mm256_extracti128_si256(__t3, 0); \ - __v4su __t5 = (__v4su)_mm256_extracti128_si256(__t3, 1); \ - __v4su __t6 = __t4 op __t5; \ - __v4su __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \ - __v4su __t8 = __t6 op __t7; \ - __v4su __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \ - __v4su __t10 = __t8 op __t9; \ - return __t10[0] static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi32(__m512i __W) { - _mm512_mask_reduce_operator(+); + return __builtin_ia32_reduce_add_d512((__v16si)__W); } static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi32(__m512i __W) { - _mm512_mask_reduce_operator(*); + return __builtin_ia32_reduce_mul_d512((__v16si)__W); } static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi32(__m512i __W) { - _mm512_mask_reduce_operator(&); + return __builtin_ia32_reduce_and_d512((__v16si)__W); } static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi32(__m512i __W) { - _mm512_mask_reduce_operator(|); + return __builtin_ia32_reduce_or_d512((__v16si)__W); } static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) { __W = _mm512_maskz_mov_epi32(__M, __W); - _mm512_mask_reduce_operator(+); + return __builtin_ia32_reduce_add_d512((__v16si)__W); } static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) { __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W); - _mm512_mask_reduce_operator(*); + return __builtin_ia32_reduce_mul_d512((__v16si)__W); } static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) { __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __W); - _mm512_mask_reduce_operator(&); + return __builtin_ia32_reduce_and_d512((__v16si)__W); } 
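Editorial aside, not part of the patch: the hunk above swaps the open-coded shuffle ladders for single __builtin_ia32_reduce_* calls, while the public intrinsic names and semantics stay the same. A minimal usage sketch of the unchanged API (hypothetical test program; assumes an AVX-512F-capable CPU and a compiler invoked with something like -mavx512f):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m512i vi = _mm512_set1_epi32(3);   /* sixteen int lanes, each 3 */
    __m512d vd = _mm512_set1_pd(0.5);    /* eight double lanes, each 0.5 */

    /* Unmasked reduction: 16 * 3 == 48. */
    printf("add_epi32      = %d\n", _mm512_reduce_add_epi32(vi));

    /* The masked form zeroes the deselected lanes first (maskz_mov above),
       so selecting the low 8 lanes gives 8 * 3 == 24. */
    printf("mask add_epi32 = %d\n", _mm512_mask_reduce_add_epi32(0x00FF, vi));

    /* The fadd builtin starts from -0.0, the neutral element of FP addition
       per the new comment, so this is simply 8 * 0.5 == 4.0. */
    printf("add_pd         = %f\n", _mm512_reduce_add_pd(vd));
    return 0;
}

On hardware without AVX-512F the program would trap with an illegal instruction, so real callers would gate it behind a CPUID check.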
static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) { __W = _mm512_maskz_mov_epi32(__M, __W); - _mm512_mask_reduce_operator(|); + return __builtin_ia32_reduce_or_d512((__v16si)__W); } -#undef _mm512_mask_reduce_operator - -#define _mm512_mask_reduce_operator(op) \ - __m256 __t1 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 0); \ - __m256 __t2 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 1); \ - __m256 __t3 = __t1 op __t2; \ - __m128 __t4 = _mm256_extractf128_ps(__t3, 0); \ - __m128 __t5 = _mm256_extractf128_ps(__t3, 1); \ - __m128 __t6 = __t4 op __t5; \ - __m128 __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \ - __m128 __t8 = __t6 op __t7; \ - __m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \ - __m128 __t10 = __t8 op __t9; \ - return __t10[0] static __inline__ float __DEFAULT_FN_ATTRS512 _mm512_reduce_add_ps(__m512 __W) { - _mm512_mask_reduce_operator(+); + return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W); } static __inline__ float __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_ps(__m512 __W) { - _mm512_mask_reduce_operator(*); + return __builtin_ia32_reduce_fmul_ps512(1.0f, __W); } static __inline__ float __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) { __W = _mm512_maskz_mov_ps(__M, __W); - _mm512_mask_reduce_operator(+); + return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W); } static __inline__ float __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) { __W = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __M, __W); - _mm512_mask_reduce_operator(*); + return __builtin_ia32_reduce_fmul_ps512(1.0f, __W); } -#undef _mm512_mask_reduce_operator - -#define _mm512_mask_reduce_operator(op) \ - __m512i __t1 = (__m512i)__builtin_shufflevector((__v8di)__V, (__v8di)__V, 4, 5, 6, 7, 0, 1, 2, 3); \ - __m512i __t2 = _mm512_##op(__V, __t1); \ - __m512i __t3 = (__m512i)__builtin_shufflevector((__v8di)__t2, (__v8di)__t2, 2, 3, 0, 1, 6, 7, 4, 5); \ - __m512i __t4 = _mm512_##op(__t2, __t3); \ - __m512i __t5 = (__m512i)__builtin_shufflevector((__v8di)__t4, (__v8di)__t4, 1, 0, 3, 2, 5, 4, 7, 6); \ - __v8di __t6 = (__v8di)_mm512_##op(__t4, __t5); \ - return __t6[0] static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_max_epi64(__m512i __V) { - _mm512_mask_reduce_operator(max_epi64); + return __builtin_ia32_reduce_smax_q512(__V); } static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 _mm512_reduce_max_epu64(__m512i __V) { - _mm512_mask_reduce_operator(max_epu64); + return __builtin_ia32_reduce_umax_q512(__V); } static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_min_epi64(__m512i __V) { - _mm512_mask_reduce_operator(min_epi64); + return __builtin_ia32_reduce_smin_q512(__V); } static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 _mm512_reduce_min_epu64(__m512i __V) { - _mm512_mask_reduce_operator(min_epu64); + return __builtin_ia32_reduce_umin_q512(__V); } static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) { __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-__LONG_LONG_MAX__ - 1LL), __M, __V); - _mm512_mask_reduce_operator(max_epi64); + return __builtin_ia32_reduce_smax_q512(__V); } static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) { __V = _mm512_maskz_mov_epi64(__M, __V); - _mm512_mask_reduce_operator(max_epu64); + return __builtin_ia32_reduce_umax_q512(__V); } static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_min_epi64(__mmask8 
__M, __m512i __V) { __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(__LONG_LONG_MAX__), __M, __V); - _mm512_mask_reduce_operator(min_epi64); + return __builtin_ia32_reduce_smin_q512(__V); } static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) { __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __V); - _mm512_mask_reduce_operator(min_epu64); + return __builtin_ia32_reduce_umin_q512(__V); } -#undef _mm512_mask_reduce_operator - -#define _mm512_mask_reduce_operator(op) \ - __m256i __t1 = _mm512_extracti64x4_epi64(__V, 0); \ - __m256i __t2 = _mm512_extracti64x4_epi64(__V, 1); \ - __m256i __t3 = _mm256_##op(__t1, __t2); \ - __m128i __t4 = _mm256_extracti128_si256(__t3, 0); \ - __m128i __t5 = _mm256_extracti128_si256(__t3, 1); \ - __m128i __t6 = _mm_##op(__t4, __t5); \ - __m128i __t7 = (__m128i)__builtin_shufflevector((__v4si)__t6, (__v4si)__t6, 2, 3, 0, 1); \ - __m128i __t8 = _mm_##op(__t6, __t7); \ - __m128i __t9 = (__m128i)__builtin_shufflevector((__v4si)__t8, (__v4si)__t8, 1, 0, 3, 2); \ - __v4si __t10 = (__v4si)_mm_##op(__t8, __t9); \ - return __t10[0] - static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_reduce_max_epi32(__m512i __V) { - _mm512_mask_reduce_operator(max_epi32); + return __builtin_ia32_reduce_smax_d512((__v16si)__V); } static __inline__ unsigned int __DEFAULT_FN_ATTRS512 _mm512_reduce_max_epu32(__m512i __V) { - _mm512_mask_reduce_operator(max_epu32); + return __builtin_ia32_reduce_umax_d512((__v16si)__V); } static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_reduce_min_epi32(__m512i __V) { - _mm512_mask_reduce_operator(min_epi32); + return __builtin_ia32_reduce_smin_d512((__v16si)__V); } static __inline__ unsigned int __DEFAULT_FN_ATTRS512 _mm512_reduce_min_epu32(__m512i __V) { - _mm512_mask_reduce_operator(min_epu32); + return __builtin_ia32_reduce_umin_d512((__v16si)__V); } static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) { __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-__INT_MAX__ - 1), __M, __V); - _mm512_mask_reduce_operator(max_epi32); + return __builtin_ia32_reduce_smax_d512((__v16si)__V); } static __inline__ unsigned int __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) { __V = _mm512_maskz_mov_epi32(__M, __V); - _mm512_mask_reduce_operator(max_epu32); + return __builtin_ia32_reduce_umax_d512((__v16si)__V); } static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) { __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(__INT_MAX__), __M, __V); - _mm512_mask_reduce_operator(min_epi32); + return __builtin_ia32_reduce_smin_d512((__v16si)__V); } static __inline__ unsigned int __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) { __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __V); - _mm512_mask_reduce_operator(min_epu32); + return __builtin_ia32_reduce_umin_d512((__v16si)__V); } -#undef _mm512_mask_reduce_operator #define _mm512_mask_reduce_operator(op) \ __m256d __t1 = _mm512_extractf64x4_pd(__V, 0); \ diff --git a/lib/include/avx512vlvnniintrin.h b/lib/include/avx512vlvnniintrin.h index b7c8fa08c6..71ac1b4370 100644 --- a/lib/include/avx512vlvnniintrin.h +++ b/lib/include/avx512vlvnniintrin.h @@ -18,13 +18,157 @@ #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128))) #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, 
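Editorial aside, not part of the patch: the doc comments added just below describe _mm256_dpbusd_epi32 and friends with \operation pseudocode. As a cross-check of that pseudocode, here is a hypothetical scalar reference for one 32-bit lane of the unsaturated u8-by-s8 case, which a test could compare against the intrinsic lane by lane:

#include <stdint.h>

/* Scalar model of one dword lane of VPDPBUSD, following the \operation
 * pseudocode: four products of a zero-extended unsigned byte and a
 * sign-extended signed byte are formed as signed 16-bit words, then summed
 * into the 32-bit accumulator lane without saturation. The 16-bit
 * intermediates cannot overflow: the product ranges from 255*(-128) = -32640
 * to 255*127 = 32385. */
static int32_t dpbusd_lane(int32_t s, const uint8_t a[4], const int8_t b[4])
{
    int32_t acc = s;
    for (int k = 0; k < 4; ++k) {
        int16_t prod = (int16_t)((uint16_t)a[k] * (int16_t)b[k]);
        acc += prod;   /* sign-extended into the 32-bit accumulator */
    }
    return acc;
}

The helper name dpbusd_lane is illustrative only; the saturating variants would differ just in clamping the final sum to INT32_MIN..INT32_MAX.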
__target__("avx512vl,avx512vnni"), __min_vector_width__(256))) +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with +/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a S, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSD instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) +/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +#define _mm256_dpbusd_epi32(S, A, B) \ + (__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B)) -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, - (__v8si)__B); -} +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with +/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a S using signed saturation, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSDS instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) +/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +#define _mm256_dpbusds_epi32(S, A, B) \ + (__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B)) + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with +/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a S, +/// and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSD instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) +/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) +/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +#define _mm256_dpwssd_epi32(S, A, B) \ + (__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)) + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with +/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a S +/// using signed saturation, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSDS instructions. 
+/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) +/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) +/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2) +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +#define _mm256_dpwssds_epi32(S, A, B) \ + (__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)) + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with +/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a S, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSD instructions. +/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) +/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +#define _mm_dpbusd_epi32(S, A, B) \ + (__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B)) + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with +/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a S using signed saturation, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSDS instructions. +/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) +/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +#define _mm_dpbusds_epi32(S, A, B) \ + (__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B)) + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with +/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a S, +/// and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSD instructions. +/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) +/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) +/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +#define _mm_dpwssd_epi32(S, A, B) \ + (__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)) + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with +/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a S +/// using signed saturation, and store the packed 32-bit results in DST. 
+/// +/// This intrinsic corresponds to the VPDPWSSDS instructions. +/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) +/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) +/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2) +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +#define _mm_dpwssds_epi32(S, A, B) \ + (__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) @@ -42,13 +186,6 @@ _mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, - (__v8si)__B); -} - static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { @@ -65,13 +202,6 @@ _mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, - (__v8si)__B); -} - static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { @@ -88,13 +218,6 @@ _mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, - (__v8si)__B); -} - static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { @@ -111,13 +234,6 @@ _mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, - (__v4si)__B); -} - static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { @@ -134,13 +250,6 @@ _mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) (__v4si)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, - (__v4si)__B); -} - static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { @@ -157,13 +266,6 @@ _mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) (__v4si)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, - (__v4si)__B); -} - static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { @@ -180,13 +282,6 @@ _mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) (__v4si)_mm_setzero_si128()); } -static __inline__ __m128i 
__DEFAULT_FN_ATTRS128 -_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, - (__v4si)__B); -} - static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { diff --git a/lib/include/avxintrin.h b/lib/include/avxintrin.h index 84421bf1b9..382b621575 100644 --- a/lib/include/avxintrin.h +++ b/lib/include/avxintrin.h @@ -2245,7 +2245,7 @@ _mm256_cvttps_epi32(__m256 __a) /// Returns the first element of the input vector of [4 x double]. /// -/// \headerfile +/// \headerfile /// /// This intrinsic is a utility function and does not correspond to a specific /// instruction. @@ -2261,7 +2261,7 @@ _mm256_cvtsd_f64(__m256d __a) /// Returns the first element of the input vector of [8 x i32]. /// -/// \headerfile +/// \headerfile /// /// This intrinsic is a utility function and does not correspond to a specific /// instruction. @@ -2278,7 +2278,7 @@ _mm256_cvtsi256_si32(__m256i __a) /// Returns the first element of the input vector of [8 x float]. /// -/// \headerfile +/// \headerfile /// /// This intrinsic is a utility function and does not correspond to a specific /// instruction. diff --git a/lib/include/avxvnniintrin.h b/lib/include/avxvnniintrin.h new file mode 100644 index 0000000000..ad45cb7962 --- /dev/null +++ b/lib/include/avxvnniintrin.h @@ -0,0 +1,225 @@ +/*===--------------- avxvnniintrin.h - VNNI intrinsics --------------------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." 
+#endif + +#ifndef __AVXVNNIINTRIN_H +#define __AVXVNNIINTRIN_H + +/* Below intrinsics defined in avx512vlvnniintrin.h can be used for AVXVNNI */ +/// \fn __m256i _mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) +/// \fn __m256i _mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) +/// \fn __m256i _mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) +/// \fn __m256i _mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) +/// \fn __m128i _mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) +/// \fn __m128i _mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) +/// \fn __m128i _mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) +/// \fn __m128i _mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) + +/* Intrinsics with _avx_ prefix are for compatibility with msvc. */ +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(256))) +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(128))) + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with +/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a __S, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSD instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])) +/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, (__v8si)__B); +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with +/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a __S using signed saturation, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSDS instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])) +/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, (__v8si)__B); +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with +/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit +/// results. 
Sum these 2 results with the corresponding 32-bit integer in \a __S, +/// and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSD instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) +/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B); +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with +/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S +/// using signed saturation, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSDS instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) +/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2) +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B); +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with +/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a __S, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSD instructions. +/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])) +/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, (__v4si)__B); +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with +/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a __S using signed saturation, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSDS instructions. 
+/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])) +/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, (__v4si)__B); +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with +/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S, +/// and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSD instructions. +/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) +/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B); +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with +/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S +/// using signed saturation, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSDS instructions. 
+/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) +/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2) +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B); +} + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif // __AVXVNNIINTRIN_H diff --git a/lib/include/cpuid.h b/lib/include/cpuid.h index 2a88c042d0..34f0e76807 100644 --- a/lib/include/cpuid.h +++ b/lib/include/cpuid.h @@ -7,6 +7,9 @@ *===-----------------------------------------------------------------------=== */ +#ifndef __CPUID_H +#define __CPUID_H + #if !(__x86_64__ || __i386__) #error this header is for x86 only #endif @@ -186,6 +189,7 @@ /* Features in %edx for leaf 7 sub-leaf 0 */ #define bit_AVX5124VNNIW 0x00000004 #define bit_AVX5124FMAPS 0x00000008 +#define bit_UINTR 0x00000020 #define bit_SERIALIZE 0x00004000 #define bit_TSXLDTRK 0x00010000 #define bit_PCONFIG 0x00040000 @@ -195,7 +199,9 @@ #define bit_AMXINT8 0x02000000 /* Features in %eax for leaf 7 sub-leaf 1 */ +#define bit_AVXVNNI 0x00000008 #define bit_AVX512BF16 0x00000020 +#define bit_HRESET 0x00400000 /* Features in %eax for leaf 13 sub-leaf 1 */ #define bit_XSAVEOPT 0x00000001 @@ -309,3 +315,5 @@ static __inline int __get_cpuid_count (unsigned int __leaf, __cpuid_count(__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx); return 1; } + +#endif /* __CPUID_H */ diff --git a/lib/include/cuda_wrappers/algorithm b/lib/include/cuda_wrappers/algorithm index 01af18360d..f14a0b00bb 100644 --- a/lib/include/cuda_wrappers/algorithm +++ b/lib/include/cuda_wrappers/algorithm @@ -1,4 +1,4 @@ -/*===---- complex - CUDA wrapper for ----------------------------=== +/*===---- algorithm - CUDA wrapper for -------------------------=== * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/lib/include/cuda_wrappers/new b/lib/include/cuda_wrappers/new index f49811c5a5..d5fb3b7011 100644 --- a/lib/include/cuda_wrappers/new +++ b/lib/include/cuda_wrappers/new @@ -1,4 +1,4 @@ -/*===---- complex - CUDA wrapper for ------------------------------=== +/*===---- new - CUDA wrapper for -------------------------------------=== * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -26,6 +26,13 @@ #include_next +#if !defined(__device__) +// The header has been included too early from the standard C++ library +// and CUDA-specific macros are not available yet. +// Undo the include guard and try again later. +#undef __CLANG_CUDA_WRAPPERS_NEW +#else + #pragma push_macro("CUDA_NOEXCEPT") #if __cplusplus >= 201103L #define CUDA_NOEXCEPT noexcept @@ -95,4 +102,5 @@ __device__ inline void operator delete[](void *, void *) CUDA_NOEXCEPT {} #pragma pop_macro("CUDA_NOEXCEPT") +#endif // __device__ #endif // include guard diff --git a/lib/include/emmintrin.h b/lib/include/emmintrin.h index 73a777b107..bb759721fa 100644 --- a/lib/include/emmintrin.h +++ b/lib/include/emmintrin.h @@ -4025,7 +4025,7 @@ _mm_storeu_si128(__m128i_u *__p, __m128i __b) /// /// \param __p /// A pointer to a 64-bit memory location. 
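With the avxvnniintrin.h additions and the new cpuid.h feature bits above, callers can probe for AVX-VNNI at runtime and use the VEX-encoded dot-product forms without touching any AVX-512 state. A minimal, hedged usage sketch — assuming the translation unit is built with -mavxvnni and that detection goes through the bit_AVXVNNI definition added to cpuid.h:

    /* Sketch: accumulate signed 16-bit pair products into 32-bit lanes with
     * the VEX-encoded VNNI form declared above.  Runtime detection uses the
     * bit_AVXVNNI definition added to cpuid.h (CPUID leaf 7, sub-leaf 1, EAX). */
    #include <cpuid.h>
    #include <immintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        unsigned int eax, ebx, ecx, edx;
        if (!__get_cpuid_count(7, 1, &eax, &ebx, &ecx, &edx) ||
            !(eax & bit_AVXVNNI)) {
            fprintf(stderr, "AVX-VNNI not available\n");
            return 1;
        }

        int16_t a[16], b[16];
        for (int i = 0; i < 16; ++i) { a[i] = (int16_t)(i + 1); b[i] = 2; }

        __m256i va  = _mm256_loadu_si256((const __m256i *)a);
        __m256i vb  = _mm256_loadu_si256((const __m256i *)b);
        __m256i acc = _mm256_setzero_si256();

        /* Each 32-bit lane j becomes a[2j]*b[2j] + a[2j+1]*b[2j+1]. */
        acc = _mm256_dpwssd_avx_epi32(acc, va, vb);

        int32_t out[8];
        _mm256_storeu_si256((__m256i *)out, acc);
        for (int j = 0; j < 8; ++j)
            printf("lane %d = %d\n", j, (int)out[j]);
        return 0;
    }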
The address of the memory -/// location does not have to be algned. +/// location does not have to be aligned. /// \param __b /// A 128-bit integer vector containing the value to be stored. static __inline__ void __DEFAULT_FN_ATTRS diff --git a/lib/include/gfniintrin.h b/lib/include/gfniintrin.h index 9bff0fcb60..11a321b7c9 100644 --- a/lib/include/gfniintrin.h +++ b/lib/include/gfniintrin.h @@ -14,38 +14,56 @@ #ifndef __GFNIINTRIN_H #define __GFNIINTRIN_H +/* Default attributes for simple form (no masking). */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni"), __min_vector_width__(128))) + +/* Default attributes for YMM unmasked form. */ +#define __DEFAULT_FN_ATTRS_Y __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), __min_vector_width__(256))) + +/* Default attributes for ZMM forms. */ +#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512))) + +/* Default attributes for VLX forms. */ +#define __DEFAULT_FN_ATTRS_VL128 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256))) #define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \ (__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), \ (char)(I)) -#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ - (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ - (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \ - (__v16qi)(__m128i)(S)) - - -#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \ - (__m128i)_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \ - U, A, B, I) +#define _mm_gf2p8affine_epi64_epi8(A, B, I) \ + (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), \ + (char)(I)) +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_gf2p8mul_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A, + (__v16qi) __B); +} +#ifdef __AVXINTRIN_H #define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) \ (__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \ (__v32qi)(__m256i)(B), \ (char)(I)) -#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ - (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ - (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \ - (__v32qi)(__m256i)(S)) - -#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \ - (__m256i)_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \ - U, A, B, I) +#define _mm256_gf2p8affine_epi64_epi8(A, B, I) \ + (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \ + (__v32qi)(__m256i)(B), \ + (char)(I)) +static __inline__ __m256i __DEFAULT_FN_ATTRS_Y +_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A, + (__v32qi) __B); +} +#endif /* __AVXINTRIN_H */ +#ifdef __AVX512BWINTRIN_H #define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) \ (__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \ (__v64qi)(__m512i)(B), \ @@ -60,37 +78,6 @@ (__m512i)_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(), \ U, A, B, I) -#define _mm_gf2p8affine_epi64_epi8(A, B, I) \ - (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), \ - (char)(I)) - 
-#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ - (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ - (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \ - (__v16qi)(__m128i)(S)) - - -#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \ - (__m128i)_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), \ - U, A, B, I) - - -#define _mm256_gf2p8affine_epi64_epi8(A, B, I) \ - (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \ - (__v32qi)(__m256i)(B), \ - (char)(I)) - -#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ - (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ - (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \ - (__v32qi)(__m256i)(S)) - -#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \ - (__m256i)_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \ - U, A, B, I) - - #define _mm512_gf2p8affine_epi64_epi8(A, B, I) \ (__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \ (__v64qi)(__m512i)(B), \ @@ -105,63 +92,6 @@ (__m512i)_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(), \ U, A, B, I) -/* Default attributes for simple form (no masking). */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni"), __min_vector_width__(128))) - -/* Default attributes for YMM unmasked form. */ -#define __DEFAULT_FN_ATTRS_Y __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), __min_vector_width__(256))) - -/* Default attributes for ZMM forms. */ -#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512))) - -/* Default attributes for VLX forms. */ -#define __DEFAULT_FN_ATTRS_VL128 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256))) - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_gf2p8mul_epi8(__m128i __A, __m128i __B) -{ - return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A, - (__v16qi) __B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128 -_mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B) -{ - return (__m128i) __builtin_ia32_selectb_128(__U, - (__v16qi) _mm_gf2p8mul_epi8(__A, __B), - (__v16qi) __S); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128 -_mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B) -{ - return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(), - __U, __A, __B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS_Y -_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B) -{ - return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A, - (__v32qi) __B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256 -_mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B) -{ - return (__m256i) __builtin_ia32_selectb_256(__U, - (__v32qi) _mm256_gf2p8mul_epi8(__A, __B), - (__v32qi) __S); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256 -_mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B) -{ - return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(), - __U, __A, __B); -} - static __inline__ __m512i __DEFAULT_FN_ATTRS_Z _mm512_gf2p8mul_epi8(__m512i __A, __m512i __B) { @@ -183,6 +113,75 @@ _mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B) return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_si512(), __U, __A, __B); } 
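The net effect of this gfniintrin.h reshuffle is that the plain (unmasked) GFNI forms now only require the gfni target feature, while the masked and zero-masked variants move behind the AVX512BW/AVX512VL header guards that follow. A short sketch of the unmasked byte multiply, assuming only -mgfni:

    /* Sketch: byte-wise multiplication in GF(2^8), reduced by the AES
     * polynomial x^8 + x^4 + x^3 + x + 1.  Only -mgfni is required for the
     * unmasked 128-bit form after this change. */
    #include <immintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        uint8_t a[16], b[16], r[16];
        for (int i = 0; i < 16; ++i) { a[i] = (uint8_t)(i + 1); b[i] = 0x1d; }

        __m128i va = _mm_loadu_si128((const __m128i *)a);
        __m128i vb = _mm_loadu_si128((const __m128i *)b);
        __m128i vr = _mm_gf2p8mul_epi8(va, vb);   /* 16 independent GF(2^8) products */

        _mm_storeu_si128((__m128i *)r, vr);
        for (int i = 0; i < 16; ++i)
            printf("%02x%c", r[i], i == 15 ? '\n' : ' ');
        return 0;
    }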
+#endif /* __AVX512BWINTRIN_H */ + +#ifdef __AVX512VLBWINTRIN_H +#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ + (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ + (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \ + (__v16qi)(__m128i)(S)) + +#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \ + (__m128i)_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \ + U, A, B, I) + +#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ + (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ + (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \ + (__v32qi)(__m256i)(S)) + +#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \ + (__m256i)_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \ + U, A, B, I) + +#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ + (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ + (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \ + (__v16qi)(__m128i)(S)) + +#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \ + (__m128i)_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), \ + U, A, B, I) + +#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ + (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ + (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \ + (__v32qi)(__m256i)(S)) + +#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \ + (__m256i)_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \ + U, A, B, I) + +static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128 +_mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_selectb_128(__U, + (__v16qi) _mm_gf2p8mul_epi8(__A, __B), + (__v16qi) __S); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128 +_mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B) +{ + return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(), + __U, __A, __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256 +_mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_selectb_256(__U, + (__v32qi) _mm256_gf2p8mul_epi8(__A, __B), + (__v32qi) __S); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256 +_mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B) +{ + return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(), + __U, __A, __B); +} +#endif /* __AVX512VLBWINTRIN_H */ #undef __DEFAULT_FN_ATTRS #undef __DEFAULT_FN_ATTRS_Y diff --git a/lib/include/hresetintrin.h b/lib/include/hresetintrin.h new file mode 100644 index 0000000000..13e31a2e03 --- /dev/null +++ b/lib/include/hresetintrin.h @@ -0,0 +1,49 @@ +/*===---------------- hresetintrin.h - HRESET intrinsics -------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __X86GPRINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __HRESETINTRIN_H +#define __HRESETINTRIN_H + +#if __has_extension(gnu_asm) + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("hreset"))) + +/// Provides a hint to the processor to selectively reset the prediction +/// history of the current logical processor specified by a 32-bit integer +/// value \a __eax. 
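In practice \a __eax is a mask of history-reset capabilities, and passing 0 is an architectural no-op. A hedged sketch of how the new _hreset intrinsic might be called — assuming -mhreset, that the header is reached through immintrin.h (which this diff wires up to include the new GPR-intrinsic umbrella), and that the capability mask was cached from CPUID leaf 0x20 at start-up (that leaf is an assumption of this example, not something the header checks):

    /* Sketch: drop selected branch-prediction history on the current logical
     * processor.  Assumes -mhreset; hreset_caps would be filled in once at
     * start-up, and _hreset(0) is always a no-op. */
    #include <immintrin.h>

    static unsigned int hreset_caps;   /* e.g. cached from CPUID.20H:EBX */

    void drop_prediction_history(void)
    {
        _hreset((int)hreset_caps);
    }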
+/// +/// This intrinsic corresponds to the HRESET instruction. +/// +/// \operation +/// IF __eax == 0 +/// // nop +/// ELSE +/// FOR i := 0 to 31 +/// IF __eax[i] +/// ResetPredictionFeature(i) +/// FI +/// ENDFOR +/// FI +/// \endoperation +static __inline void __DEFAULT_FN_ATTRS +_hreset(int __eax) +{ + __asm__ ("hreset $0" :: "a"(__eax)); +} + +#undef __DEFAULT_FN_ATTRS + +#endif /* __has_extension(gnu_asm) */ + +#endif /* __HRESETINTRIN_H */ diff --git a/lib/include/ia32intrin.h b/lib/include/ia32intrin.h index 79b7f0655c..00138effd5 100644 --- a/lib/include/ia32intrin.h +++ b/lib/include/ia32intrin.h @@ -14,6 +14,18 @@ #ifndef __IA32INTRIN_H #define __IA32INTRIN_H +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) +#define __DEFAULT_FN_ATTRS_SSE42 __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) + +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS_CAST __attribute__((__always_inline__)) constexpr +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#else +#define __DEFAULT_FN_ATTRS_CAST __attribute__((__always_inline__)) +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#endif + /** Find the first set bit starting from the lsb. Result is undefined if * input is 0. * @@ -26,7 +38,7 @@ * A 32-bit integer operand. * \returns A 32-bit integer containing the bit number. */ -static __inline__ int __attribute__((__always_inline__, __nodebug__)) +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR __bsfd(int __A) { return __builtin_ctz(__A); } @@ -43,7 +55,7 @@ __bsfd(int __A) { * A 32-bit integer operand. * \returns A 32-bit integer containing the bit number. */ -static __inline__ int __attribute__((__always_inline__, __nodebug__)) +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR __bsrd(int __A) { return 31 - __builtin_clz(__A); } @@ -59,12 +71,12 @@ __bsrd(int __A) { * A 32-bit integer operand. * \returns A 32-bit integer containing the swapped bytes. */ -static __inline__ int __attribute__((__always_inline__, __nodebug__)) +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR __bswapd(int __A) { return __builtin_bswap32(__A); } -static __inline__ int __attribute__((__always_inline__, __nodebug__)) +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR _bswap(int __A) { return __builtin_bswap32(__A); } @@ -85,7 +97,7 @@ _bswap(int __A) { * A 64-bit integer operand. * \returns A 32-bit integer containing the bit number. */ -static __inline__ int __attribute__((__always_inline__, __nodebug__)) +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR __bsfq(long long __A) { return __builtin_ctzll(__A); } @@ -102,7 +114,7 @@ __bsfq(long long __A) { * A 64-bit integer operand. * \returns A 32-bit integer containing the bit number. */ -static __inline__ int __attribute__((__always_inline__, __nodebug__)) +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR __bsrq(long long __A) { return 63 - __builtin_clzll(__A); } @@ -118,7 +130,7 @@ __bsrq(long long __A) { * A 64-bit integer operand. * \returns A 64-bit integer containing the swapped bytes. */ -static __inline__ long long __attribute__((__always_inline__, __nodebug__)) +static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR __bswapq(long long __A) { return __builtin_bswap64(__A); } @@ -138,7 +150,7 @@ __bswapq(long long __A) { * \returns A 32-bit integer containing the number of bits with value 1 in the * source operand. 
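The practical effect of the new attribute macros in ia32intrin.h is that these scalar helpers keep their old C behaviour but become usable in constant expressions from C++11 onward. A quick sketch exercising a few of them from plain C (no extra target flags are needed for these particular helpers):

    /* Sketch: the scalar ia32 helpers shown above, called from C. */
    #include <x86intrin.h>   /* pulls in ia32intrin.h */
    #include <stdio.h>

    int main(void) {
        unsigned int v = 0x00F0u;
        printf("lowest set bit : %d\n", __bsfd((int)v));     /* 4 */
        printf("highest set bit: %d\n", __bsrd((int)v));     /* 7 */
        printf("byte swap      : 0x%08x\n",
               (unsigned int)__bswapd(0x11223344));          /* 0x44332211 */
        printf("popcount       : %d\n", __popcntd(v));       /* 4 */
        return 0;
    }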
*/ -static __inline__ int __attribute__((__always_inline__, __nodebug__)) +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR __popcntd(unsigned int __A) { return __builtin_popcount(__A); @@ -159,7 +171,7 @@ __popcntd(unsigned int __A) * \returns A 64-bit integer containing the number of bits with value 1 in the * source operand. */ -static __inline__ long long __attribute__((__always_inline__, __nodebug__)) +static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR __popcntq(unsigned long long __A) { return __builtin_popcountll(__A); @@ -169,26 +181,26 @@ __popcntq(unsigned long long __A) #endif /* __x86_64__ */ #ifdef __x86_64__ -static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__)) +static __inline__ unsigned long long __DEFAULT_FN_ATTRS __readeflags(void) { return __builtin_ia32_readeflags_u64(); } -static __inline__ void __attribute__((__always_inline__, __nodebug__)) +static __inline__ void __DEFAULT_FN_ATTRS __writeeflags(unsigned long long __f) { __builtin_ia32_writeeflags_u64(__f); } #else /* !__x86_64__ */ -static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__)) +static __inline__ unsigned int __DEFAULT_FN_ATTRS __readeflags(void) { return __builtin_ia32_readeflags_u32(); } -static __inline__ void __attribute__((__always_inline__, __nodebug__)) +static __inline__ void __DEFAULT_FN_ATTRS __writeeflags(unsigned int __f) { __builtin_ia32_writeeflags_u32(__f); @@ -205,11 +217,9 @@ __writeeflags(unsigned int __f) * A 32-bit float value. * \returns a 32-bit unsigned integer containing the converted value. */ -static __inline__ unsigned int __attribute__((__always_inline__)) +static __inline__ unsigned int __DEFAULT_FN_ATTRS_CAST _castf32_u32(float __A) { - unsigned int D; - __builtin_memcpy(&D, &__A, sizeof(__A)); - return D; + return __builtin_bit_cast(unsigned int, __A); } /** Cast a 64-bit float value to a 64-bit unsigned integer value @@ -222,11 +232,9 @@ _castf32_u32(float __A) { * A 64-bit float value. * \returns a 64-bit unsigned integer containing the converted value. */ -static __inline__ unsigned long long __attribute__((__always_inline__)) +static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CAST _castf64_u64(double __A) { - unsigned long long D; - __builtin_memcpy(&D, &__A, sizeof(__A)); - return D; + return __builtin_bit_cast(unsigned long long, __A); } /** Cast a 32-bit unsigned integer value to a 32-bit float value @@ -239,11 +247,9 @@ _castf64_u64(double __A) { * A 32-bit unsigned integer value. * \returns a 32-bit float value containing the converted value. */ -static __inline__ float __attribute__((__always_inline__)) +static __inline__ float __DEFAULT_FN_ATTRS_CAST _castu32_f32(unsigned int __A) { - float D; - __builtin_memcpy(&D, &__A, sizeof(__A)); - return D; + return __builtin_bit_cast(float, __A); } /** Cast a 64-bit unsigned integer value to a 64-bit float value @@ -256,11 +262,9 @@ _castu32_f32(unsigned int __A) { * A 64-bit unsigned integer value. * \returns a 64-bit float value containing the converted value. */ -static __inline__ double __attribute__((__always_inline__)) +static __inline__ double __DEFAULT_FN_ATTRS_CAST _castu64_f64(unsigned long long __A) { - double D; - __builtin_memcpy(&D, &__A, sizeof(__A)); - return D; + return __builtin_bit_cast(double, __A); } /** Adds the unsigned integer operand to the CRC-32C checksum of the @@ -278,7 +282,7 @@ _castu64_f64(unsigned long long __A) { * \returns The result of adding operand \a __C to the CRC-32C checksum of * operand \a __D. 
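Replacing the memcpy idiom with __builtin_bit_cast keeps the same bit-for-bit semantics while allowing constant evaluation in C++. A small sketch combining the bit-cast helpers with one of the CRC-32C accumulators documented next (the CRC call assumes -msse4.2):

    /* Sketch (-msse4.2): fold the raw bits of a float into a CRC-32C value
     * using the ia32intrin.h helpers shown above. */
    #include <x86intrin.h>
    #include <stdio.h>

    int main(void) {
        unsigned int bits = _castf32_u32(1.5f);         /* 0x3fc00000 */
        unsigned int crc  = __crc32d(0xFFFFFFFFu, bits);
        printf("bits = 0x%08x, crc32c = 0x%08x\n", bits, crc);
        printf("round-trip: %f\n", _castu32_f32(bits)); /* 1.500000 */
        return 0;
    }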
*/ -static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) +static __inline__ unsigned int __DEFAULT_FN_ATTRS_SSE42 __crc32b(unsigned int __C, unsigned char __D) { return __builtin_ia32_crc32qi(__C, __D); @@ -299,7 +303,7 @@ __crc32b(unsigned int __C, unsigned char __D) * \returns The result of adding operand \a __C to the CRC-32C checksum of * operand \a __D. */ -static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) +static __inline__ unsigned int __DEFAULT_FN_ATTRS_SSE42 __crc32w(unsigned int __C, unsigned short __D) { return __builtin_ia32_crc32hi(__C, __D); @@ -320,7 +324,7 @@ __crc32w(unsigned int __C, unsigned short __D) * \returns The result of adding operand \a __C to the CRC-32C checksum of * operand \a __D. */ -static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) +static __inline__ unsigned int __DEFAULT_FN_ATTRS_SSE42 __crc32d(unsigned int __C, unsigned int __D) { return __builtin_ia32_crc32si(__C, __D); @@ -342,20 +346,20 @@ __crc32d(unsigned int __C, unsigned int __D) * \returns The result of adding operand \a __C to the CRC-32C checksum of * operand \a __D. */ -static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) +static __inline__ unsigned long long __DEFAULT_FN_ATTRS_SSE42 __crc32q(unsigned long long __C, unsigned long long __D) { return __builtin_ia32_crc32di(__C, __D); } #endif /* __x86_64__ */ -static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__)) +static __inline__ unsigned long long __DEFAULT_FN_ATTRS __rdpmc(int __A) { return __builtin_ia32_rdpmc(__A); } /* __rdtscp */ -static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__)) +static __inline__ unsigned long long __DEFAULT_FN_ATTRS __rdtscp(unsigned int *__A) { return __builtin_ia32_rdtscp(__A); } @@ -364,48 +368,48 @@ __rdtscp(unsigned int *__A) { #define _rdpmc(A) __rdpmc(A) -static __inline__ void __attribute__((__always_inline__, __nodebug__)) +static __inline__ void __DEFAULT_FN_ATTRS _wbinvd(void) { __builtin_ia32_wbinvd(); } -static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__)) +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR __rolb(unsigned char __X, int __C) { return __builtin_rotateleft8(__X, __C); } -static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__)) +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR __rorb(unsigned char __X, int __C) { return __builtin_rotateright8(__X, __C); } -static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__)) +static __inline__ unsigned short __DEFAULT_FN_ATTRS_CONSTEXPR __rolw(unsigned short __X, int __C) { return __builtin_rotateleft16(__X, __C); } -static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__)) +static __inline__ unsigned short __DEFAULT_FN_ATTRS_CONSTEXPR __rorw(unsigned short __X, int __C) { return __builtin_rotateright16(__X, __C); } -static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__)) +static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR __rold(unsigned int __X, int __C) { return __builtin_rotateleft32(__X, __C); } -static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__)) +static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR __rord(unsigned int __X, int __C) { return __builtin_rotateright32(__X, __C); } #ifdef __x86_64__ 
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__)) +static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR __rolq(unsigned long long __X, int __C) { return __builtin_rotateleft64(__X, __C); } -static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__)) +static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR __rorq(unsigned long long __X, int __C) { return __builtin_rotateright64(__X, __C); } @@ -429,4 +433,9 @@ __rorq(unsigned long long __X, int __C) { #define _rotwl(a,b) __rolw((a), (b)) #define _rotwr(a,b) __rorw((a), (b)) +#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS_CAST +#undef __DEFAULT_FN_ATTRS_SSE42 +#undef __DEFAULT_FN_ATTRS_CONSTEXPR + #endif /* __IA32INTRIN_H */ diff --git a/lib/include/immintrin.h b/lib/include/immintrin.h index e9dff2310f..22f7a520c9 100644 --- a/lib/include/immintrin.h +++ b/lib/include/immintrin.h @@ -10,6 +10,8 @@ #ifndef __IMMINTRIN_H #define __IMMINTRIN_H +#include + #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ defined(__MMX__) #include @@ -143,6 +145,11 @@ #include #endif +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AVXVNNI__) +#include +#endif + #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ defined(__AVX512DQ__) #include @@ -471,6 +478,11 @@ _storebe_i64(void * __P, long long __D) { #include #endif +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__KL__) || defined(__WIDEKL__) +#include +#endif + #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ defined(__AMXTILE__) || defined(__AMXINT8__) || defined(__AMXBF16__) #include diff --git a/lib/include/intrin.h b/lib/include/intrin.h index 871b47ca82..a78b96997d 100644 --- a/lib/include/intrin.h +++ b/lib/include/intrin.h @@ -57,16 +57,11 @@ void __addfsbyte(unsigned long, unsigned char); void __addfsdword(unsigned long, unsigned long); void __addfsword(unsigned long, unsigned short); void __code_seg(const char *); -static __inline__ void __cpuid(int[4], int); -static __inline__ void __cpuidex(int[4], int, int); -static __inline__ __int64 __emul(int, int); -static __inline__ unsigned __int64 __emulu(unsigned int, unsigned int); unsigned int __getcallerseflags(void); -static __inline__ void __halt(void); unsigned char __inbyte(unsigned short); void __inbytestring(unsigned short, unsigned char *, unsigned long); @@ -82,13 +77,9 @@ void __inwordstring(unsigned short, unsigned short *, unsigned long); void __lidt(void *); unsigned __int64 __ll_lshift(unsigned __int64, int); __int64 __ll_rshift(__int64, int); -static __inline__ void __movsb(unsigned char *, unsigned char const *, size_t); -static __inline__ void __movsd(unsigned long *, unsigned long const *, size_t); -static __inline__ void __movsw(unsigned short *, unsigned short const *, size_t); -static __inline__ void __nop(void); void __nvreg_restore_fence(void); void __nvreg_save_fence(void); @@ -105,23 +96,16 @@ unsigned long __readcr4(void); unsigned long __readcr8(void); unsigned int __readdr(unsigned int); #ifdef __i386__ -static __inline__ unsigned char __readfsbyte(unsigned long); -static __inline__ unsigned __int64 __readfsqword(unsigned long); -static __inline__ unsigned short __readfsword(unsigned long); #endif -static __inline__ unsigned __int64 __readmsr(unsigned long); unsigned __int64 __readpmc(unsigned long); unsigned long __segmentlimit(unsigned long); void __sidt(void *); -static 
__inline__ void __stosb(unsigned char *, unsigned char, size_t); -static __inline__ void __stosd(unsigned long *, unsigned long, size_t); -static __inline__ void __stosw(unsigned short *, unsigned short, size_t); void __svm_clgi(void); void __svm_invlpga(void *, int); @@ -136,7 +120,6 @@ void __vmx_off(void); void __vmx_vmptrst(unsigned __int64 *); void __wbinvd(void); void __writecr0(unsigned int); -static __inline__ void __writecr3(unsigned __INTPTR_TYPE__); void __writecr4(unsigned int); void __writecr8(unsigned int); @@ -146,11 +129,8 @@ void __writefsdword(unsigned long, unsigned long); void __writefsqword(unsigned long, unsigned __int64); void __writefsword(unsigned long, unsigned short); void __writemsr(unsigned long, unsigned __int64); -static __inline__ void *_AddressOfReturnAddress(void); -static __inline__ unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask); -static __inline__ unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask); unsigned char _bittest(long const *, long); unsigned char _bittestandcomplement(long *, long); @@ -169,12 +149,10 @@ long _InterlockedExchangeAdd_HLEAcquire(long volatile *, long); long _InterlockedExchangeAdd_HLERelease(long volatile *, long); __int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *, __int64); __int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *, __int64); -static __inline__ void -__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead"))) -_ReadBarrier(void); -static __inline__ void -__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead"))) -_ReadWriteBarrier(void); +void __attribute__((__deprecated__( + "use other intrinsics or C++11 atomics instead"))) _ReadBarrier(void); +void __attribute__((__deprecated__( + "use other intrinsics or C++11 atomics instead"))) _ReadWriteBarrier(void); unsigned int _rorx_u32(unsigned int, const unsigned int); int _sarx_i32(int, unsigned int); #if __STDC_HOSTED__ @@ -185,9 +163,8 @@ unsigned int _shrx_u32(unsigned int, unsigned int); void _Store_HLERelease(long volatile *, long); void _Store64_HLERelease(__int64 volatile *, __int64); void _StorePointer_HLERelease(void *volatile *, void *); -static __inline__ void -__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead"))) -_WriteBarrier(void); +void __attribute__((__deprecated__( + "use other intrinsics or C++11 atomics instead"))) _WriteBarrier(void); unsigned __int32 xbegin(void); void _xend(void); @@ -197,19 +174,14 @@ void __addgsbyte(unsigned long, unsigned char); void __addgsdword(unsigned long, unsigned long); void __addgsqword(unsigned long, unsigned __int64); void __addgsword(unsigned long, unsigned short); -static __inline__ void __faststorefence(void); void __incgsbyte(unsigned long); void __incgsdword(unsigned long); void __incgsqword(unsigned long); void __incgsword(unsigned long); -static __inline__ void __movsq(unsigned long long *, unsigned long long const *, size_t); -static __inline__ unsigned char __readgsbyte(unsigned long); -static __inline__ unsigned long __readgsdword(unsigned long); -static __inline__ unsigned __int64 __readgsqword(unsigned long); unsigned short __readgsword(unsigned long); unsigned __int64 __shiftleft128(unsigned __int64 _LowPart, @@ -218,7 +190,6 @@ unsigned __int64 __shiftleft128(unsigned __int64 _LowPart, unsigned __int64 __shiftright128(unsigned __int64 _LowPart, unsigned __int64 _HighPart, unsigned char _Shift); -static __inline__ void __stosq(unsigned __int64 *, unsigned __int64, 
size_t); unsigned char __vmx_on(unsigned __int64 *); unsigned char __vmx_vmclear(unsigned __int64 *); @@ -243,10 +214,6 @@ unsigned char _interlockedbittestandreset64(__int64 volatile *, __int64); unsigned char _interlockedbittestandset64(__int64 volatile *, __int64); long _InterlockedCompareExchange_np(long volatile *_Destination, long _Exchange, long _Comparand); -unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination, - __int64 _ExchangeHigh, - __int64 _ExchangeLow, - __int64 *_CompareandResult); unsigned char _InterlockedCompareExchange128_np(__int64 volatile *_Destination, __int64 _ExchangeHigh, __int64 _ExchangeLow, @@ -269,13 +236,9 @@ unsigned __int64 _rorx_u64(unsigned __int64, const unsigned int); __int64 _sarx_i64(__int64, unsigned int); unsigned __int64 _shlx_u64(unsigned __int64, unsigned int); unsigned __int64 _shrx_u64(unsigned __int64, unsigned int); -static __inline__ __int64 __mulh(__int64, __int64); -static __inline__ unsigned __int64 __umulh(unsigned __int64, unsigned __int64); -static __inline__ __int64 _mul128(__int64, __int64, __int64*); -static __inline__ unsigned __int64 _umul128(unsigned __int64, unsigned __int64, unsigned __int64*); @@ -284,29 +247,19 @@ unsigned __int64 _umul128(unsigned __int64, #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) -static __inline__ unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask); -static __inline__ unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask); #endif #if defined(__i386__) || defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) -static __inline__ __int64 _InterlockedDecrement64(__int64 volatile *_Addend); -static __inline__ __int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value); -static __inline__ __int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value); -static __inline__ __int64 _InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value); -static __inline__ __int64 _InterlockedIncrement64(__int64 volatile *_Addend); -static __inline__ __int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask); -static __inline__ __int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask); -static __inline__ __int64 _InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask); #endif @@ -470,45 +423,81 @@ __int64 _InterlockedCompareExchange64_nf(__int64 volatile *_Destination, __int64 _InterlockedCompareExchange64_rel(__int64 volatile *_Destination, __int64 _Exchange, __int64 _Comparand); #endif +#if defined(__x86_64__) || defined(__aarch64__) +unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination, + __int64 _ExchangeHigh, + __int64 _ExchangeLow, + __int64 *_ComparandResult); +#endif +#if defined(__aarch64__) +unsigned char _InterlockedCompareExchange128_acq(__int64 volatile *_Destination, + __int64 _ExchangeHigh, + __int64 _ExchangeLow, + __int64 *_ComparandResult); +unsigned char _InterlockedCompareExchange128_nf(__int64 volatile *_Destination, + __int64 _ExchangeHigh, + __int64 _ExchangeLow, + __int64 *_ComparandResult); +unsigned char _InterlockedCompareExchange128_rel(__int64 volatile *_Destination, + __int64 _ExchangeHigh, + __int64 _ExchangeLow, + __int64 *_ComparandResult); +#endif /*----------------------------------------------------------------------------*\ |* movs, stos \*----------------------------------------------------------------------------*/ #if defined(__i386__) || defined(__x86_64__) -static __inline__ void __DEFAULT_FN_ATTRS 
-__movsb(unsigned char *__dst, unsigned char const *__src, size_t __n) { +static __inline__ void __DEFAULT_FN_ATTRS __movsb(unsigned char *__dst, + unsigned char const *__src, + size_t __n) { __asm__ __volatile__("rep movsb" : "+D"(__dst), "+S"(__src), "+c"(__n) : : "memory"); } -static __inline__ void __DEFAULT_FN_ATTRS -__movsd(unsigned long *__dst, unsigned long const *__src, size_t __n) { - __asm__ __volatile__("rep movsl" : "+D"(__dst), "+S"(__src), "+c"(__n) - : : "memory"); -} -static __inline__ void __DEFAULT_FN_ATTRS -__movsw(unsigned short *__dst, unsigned short const *__src, size_t __n) { - __asm__ __volatile__("rep movsw" : "+D"(__dst), "+S"(__src), "+c"(__n) - : : "memory"); -} -static __inline__ void __DEFAULT_FN_ATTRS -__stosd(unsigned long *__dst, unsigned long __x, size_t __n) { - __asm__ __volatile__("rep stosl" : "+D"(__dst), "+c"(__n) : "a"(__x) +static __inline__ void __DEFAULT_FN_ATTRS __movsd(unsigned long *__dst, + unsigned long const *__src, + size_t __n) { + __asm__ __volatile__("rep movsl" + : "+D"(__dst), "+S"(__src), "+c"(__n) + : : "memory"); } -static __inline__ void __DEFAULT_FN_ATTRS -__stosw(unsigned short *__dst, unsigned short __x, size_t __n) { - __asm__ __volatile__("rep stosw" : "+D"(__dst), "+c"(__n) : "a"(__x) +static __inline__ void __DEFAULT_FN_ATTRS __movsw(unsigned short *__dst, + unsigned short const *__src, + size_t __n) { + __asm__ __volatile__("rep movsw" + : "+D"(__dst), "+S"(__src), "+c"(__n) + : + : "memory"); +} +static __inline__ void __DEFAULT_FN_ATTRS __stosd(unsigned long *__dst, + unsigned long __x, + size_t __n) { + __asm__ __volatile__("rep stosl" + : "+D"(__dst), "+c"(__n) + : "a"(__x) + : "memory"); +} +static __inline__ void __DEFAULT_FN_ATTRS __stosw(unsigned short *__dst, + unsigned short __x, + size_t __n) { + __asm__ __volatile__("rep stosw" + : "+D"(__dst), "+c"(__n) + : "a"(__x) : "memory"); } #endif #ifdef __x86_64__ -static __inline__ void __DEFAULT_FN_ATTRS -__movsq(unsigned long long *__dst, unsigned long long const *__src, size_t __n) { - __asm__ __volatile__("rep movsq" : "+D"(__dst), "+S"(__src), "+c"(__n) - : : "memory"); +static __inline__ void __DEFAULT_FN_ATTRS __movsq( + unsigned long long *__dst, unsigned long long const *__src, size_t __n) { + __asm__ __volatile__("rep movsq" + : "+D"(__dst), "+S"(__src), "+c"(__n) + : + : "memory"); } -static __inline__ void __DEFAULT_FN_ATTRS -__stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) { +static __inline__ void __DEFAULT_FN_ATTRS __stosq(unsigned __int64 *__dst, + unsigned __int64 __x, + size_t __n) { __asm__ __volatile__("rep stosq" : "+D"(__dst), "+c"(__n) : "a"(__x) : "memory"); } @@ -518,26 +507,25 @@ __stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) { |* Misc \*----------------------------------------------------------------------------*/ #if defined(__i386__) || defined(__x86_64__) -static __inline__ void __DEFAULT_FN_ATTRS -__cpuid(int __info[4], int __level) { - __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3]) - : "a"(__level), "c"(0)); +static __inline__ void __DEFAULT_FN_ATTRS __cpuid(int __info[4], int __level) { + __asm__("cpuid" + : "=a"(__info[0]), "=b"(__info[1]), "=c"(__info[2]), "=d"(__info[3]) + : "a"(__level), "c"(0)); } -static __inline__ void __DEFAULT_FN_ATTRS -__cpuidex(int __info[4], int __level, int __ecx) { - __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3]) - : "a"(__level), "c"(__ecx)); +static __inline__ void __DEFAULT_FN_ATTRS 
__cpuidex(int __info[4], int __level, + int __ecx) { + __asm__("cpuid" + : "=a"(__info[0]), "=b"(__info[1]), "=c"(__info[2]), "=d"(__info[3]) + : "a"(__level), "c"(__ecx)); } -static __inline__ void __DEFAULT_FN_ATTRS -__halt(void) { - __asm__ volatile ("hlt"); +static __inline__ void __DEFAULT_FN_ATTRS __halt(void) { + __asm__ volatile("hlt"); } #endif #if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__) -static __inline__ void __DEFAULT_FN_ATTRS -__nop(void) { - __asm__ volatile ("nop"); +static __inline__ void __DEFAULT_FN_ATTRS __nop(void) { + __asm__ volatile("nop"); } #endif @@ -574,8 +562,7 @@ __readmsr(unsigned long __register) { } #endif -static __inline__ unsigned __LPTRINT_TYPE__ __DEFAULT_FN_ATTRS -__readcr3(void) { +static __inline__ unsigned __LPTRINT_TYPE__ __DEFAULT_FN_ATTRS __readcr3(void) { unsigned __LPTRINT_TYPE__ __cr3_val; __asm__ __volatile__ ("mov %%cr3, %0" : "=r"(__cr3_val) : : "memory"); return __cr3_val; diff --git a/lib/include/keylockerintrin.h b/lib/include/keylockerintrin.h new file mode 100644 index 0000000000..c15d39c8e3 --- /dev/null +++ b/lib/include/keylockerintrin.h @@ -0,0 +1,506 @@ +/*===----------------- keylockerintrin.h - KL Intrinsics -------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef _KEYLOCKERINTRIN_H +#define _KEYLOCKERINTRIN_H + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__KL__) + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("kl"),\ + __min_vector_width__(128))) + +/// Load internal wrapping key from __intkey, __enkey_lo and __enkey_hi. __ctl +/// will assigned to EAX, whch specifies the KeySource and whether backing up +/// the key is permitted. The 256-bit encryption key is loaded from the two +/// explicit operands (__enkey_lo and __enkey_hi). The 128-bit integrity key is +/// loaded from the implicit operand XMM0 which assigned by __intkey. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the LOADIWKEY instructions. 
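Because LOADIWKEY is restricted to ring 0, only an OS kernel would issue it; the remaining Key Locker intrinsics operate against whatever wrapping key the kernel installed. A hedged sketch of that privileged step, assuming -mkl and using the __ctl layout spelled out in the pseudo-operation that follows (bit 0 = NoBackup, bits 4:1 = KeySource):

    /* Sketch (ring 0 only, -mkl): install a hardware-random internal wrapping
     * key.  KeySource = 1 XORs the supplied key material with full-entropy
     * random data, so software never learns the final IWKey; a real kernel
     * would also confirm ZF = 0 (random data was available) and retry if not. */
    #include <immintrin.h>

    void install_iwkey(void)
    {
        __m128i integrity_key = _mm_setzero_si128();  /* from a DRBG in practice */
        __m128i enkey_lo      = _mm_setzero_si128();
        __m128i enkey_hi      = _mm_setzero_si128();
        unsigned int ctl      = (1u << 1)   /* KeySource = 1 */
                              | 1u;         /* NoBackup */

        _mm_loadiwkey(ctl, integrity_key, enkey_lo, enkey_hi);
    }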
+/// +/// \operation +/// IF CPL > 0 // LOADKWKEY only allowed at ring 0 (supervisor mode) +/// GP (0) +/// FI +/// IF “LOADIWKEY exiting” VM execution control set +/// VMexit +/// FI +/// IF __ctl[4:1] > 1 // Reserved KeySource encoding used +/// GP (0) +/// FI +/// IF __ctl[31:5] != 0 // Reserved bit in __ctl is set +/// GP (0) +/// FI +/// IF __ctl[0] AND (CPUID.19H.ECX[0] == 0) // NoBackup is not supported on this part +/// GP (0) +/// FI +/// IF (__ctl[4:1] == 1) AND (CPUID.19H.ECX[1] == 0) // KeySource of 1 is not supported on this part +/// GP (0) +/// FI +/// IF (__ctl[4:1] == 0) // KeySource of 0. +/// IWKey.Encryption Key[127:0] := __enkey_hi[127:0]: +/// IWKey.Encryption Key[255:128] := __enkey_lo[127:0] +/// IWKey.IntegrityKey[127:0] := __intkey[127:0] +/// IWKey.NoBackup := __ctl[0] +/// IWKey.KeySource := __ctl[4:1] +/// ZF := 0 +/// ELSE // KeySource of 1. See RDSEED definition for details of randomness +/// IF HW_NRND_GEN.ready == 1 // Full-entropy random data from RDSEED was received +/// IWKey.Encryption Key[127:0] := __enkey_hi[127:0] XOR HW_NRND_GEN.data[127:0] +/// IWKey.Encryption Key[255:128] := __enkey_lo[127:0] XOR HW_NRND_GEN.data[255:128] +/// IWKey.Encryption Key[255:0] := __enkey_hi[127:0]:__enkey_lo[127:0] XOR HW_NRND_GEN.data[255:0] +/// IWKey.IntegrityKey[127:0] := __intkey[127:0] XOR HW_NRND_GEN.data[383:256] +/// IWKey.NoBackup := __ctl[0] +/// IWKey.KeySource := __ctl[4:1] +/// ZF := 0 +/// ELSE // Random data was not returned from RDSEED. IWKey was not loaded +/// ZF := 1 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ void __DEFAULT_FN_ATTRS +_mm_loadiwkey (unsigned int __ctl, __m128i __intkey, + __m128i __enkey_lo, __m128i __enkey_hi) { + __builtin_ia32_loadiwkey (__intkey, __enkey_lo, __enkey_hi, __ctl); +} + +/// Wrap a 128-bit AES key from __key into a key handle and output in +/// ((__m128i*)__h) to ((__m128i*)__h) + 5 and a 32-bit value as return. +/// The explicit source operand __htype specifies handle restrictions. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the ENCODEKEY128 instructions. +/// +/// \operation +/// InputKey[127:0] := __key[127:0] +/// KeyMetadata[2:0] := __htype[2:0] +/// KeyMetadata[23:3] := 0 // Reserved for future usage +/// KeyMetadata[27:24] := 0 // KeyType is AES-128 (value of 0) +/// KeyMetadata[127:28] := 0 // Reserved for future usage +/// Handle[383:0] := WrapKey128(InputKey[127:0], KeyMetadata[127:0], +/// IWKey.Integrity Key[127:0], IWKey.Encryption Key[255:0]) +/// dst[0] := IWKey.NoBackup +/// dst[4:1] := IWKey.KeySource[3:0] +/// dst[31:5] := 0 +/// MEM[__h+127:__h] := Handle[127:0] // AAD +/// MEM[__h+255:__h+128] := Handle[255:128] // Integrity Tag +/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText +/// MEM[__h+511:__h+384] := 0 // Reserved for future usage +/// MEM[__h+639:__h+512] := 0 // Reserved for future usage +/// MEM[__h+767:__h+640] := 0 // Reserved for future usage +/// OF := 0 +/// SF := 0 +/// ZF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) { + return __builtin_ia32_encodekey128_u32(__htype, (__v2di)__key, __h); +} + +/// Wrap a 256-bit AES key from __key_hi:__key_lo into a key handle, then +/// output handle in ((__m128i*)__h) to ((__m128i*)__h) + 6 and +/// a 32-bit value as return. 
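Both ENCODEKEY forms follow the same shape; for the 128-bit variant defined just above, a user-space sketch (assuming -mkl, and sizing the output as the six __m128i blocks the header documents) might look like:

    /* Sketch (user space, -mkl): wrap a raw AES-128 key into a key handle and
     * forget the raw key afterwards.  The handle buffer covers the 48-byte
     * handle plus the zeroed tail the header writes. */
    #include <immintrin.h>

    unsigned int wrap_key(const unsigned char raw_key[16], __m128i handle[6])
    {
        __m128i key = _mm_loadu_si128((const __m128i *)raw_key);

        /* __htype = 0: no usage restrictions requested (the restriction-bit
         * encoding is an assumption of this example). */
        return _mm_encodekey128_u32(0, key, handle);
    }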
+/// The explicit source operand __htype specifies handle restrictions. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the ENCODEKEY256 instructions. +/// +/// \operation +/// InputKey[127:0] := __key_lo[127:0] +/// InputKey[255:128] := __key_hi[255:128] +/// KeyMetadata[2:0] := __htype[2:0] +/// KeyMetadata[23:3] := 0 // Reserved for future usage +/// KeyMetadata[27:24] := 1 // KeyType is AES-256 (value of 1) +/// KeyMetadata[127:28] := 0 // Reserved for future usage +/// Handle[511:0] := WrapKey256(InputKey[255:0], KeyMetadata[127:0], +/// IWKey.Integrity Key[127:0], IWKey.Encryption Key[255:0]) +/// dst[0] := IWKey.NoBackup +/// dst[4:1] := IWKey.KeySource[3:0] +/// dst[31:5] := 0 +/// MEM[__h+127:__h] := Handle[127:0] // AAD +/// MEM[__h+255:__h+128] := Handle[255:128] // Tag +/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText[127:0] +/// MEM[__h+511:__h+384] := Handle[511:384] // CipherText[255:128] +/// MEM[__h+639:__h+512] := 0 // Reserved for future usage +/// MEM[__h+767:__h+640] := 0 // Reserved for future usage +/// MEM[__h+895:__h+768] := 0 Integrity// Reserved for future usage +/// OF := 0 +/// SF := 0 +/// ZF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_mm_encodekey256_u32(unsigned int __htype, __m128i __key_lo, __m128i __key_hi, + void *__h) { + return __builtin_ia32_encodekey256_u32(__htype, (__v2di)__key_lo, + (__v2di)__key_hi, __h); +} + +/// The AESENC128KL performs 10 rounds of AES to encrypt the __idata using +/// the 128-bit key in the handle from the __h. It stores the result in the +/// __odata. And return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESENC128KL instructions. +/// +/// \operation +/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic. +/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[383:256] || +/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 ) +/// IF (IllegalHandle) +/// ZF := 1 +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey) +/// IF (Authentic == 0) +/// ZF := 1 +/// ELSE +/// MEM[__odata+127:__odata] := AES128Encrypt (__idata[127:0], UnwrappedKey) +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) { + return __builtin_ia32_aesenc128kl_u8((__v2di *)__odata, (__v2di)__idata, __h); +} + +/// The AESENC256KL performs 14 rounds of AES to encrypt the __idata using +/// the 256-bit key in the handle from the __h. It stores the result in the +/// __odata. And return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESENC256KL instructions. +/// +/// \operation +/// Handle[511:0] := MEM[__h+511:__h] // Load is not guaranteed to be atomic. 
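Single blocks are then encrypted through the handle rather than through a raw key schedule; the unsigned char return value is the ZF status from the pseudo-code, so a non-zero result means the handle was rejected rather than that garbage was produced. A sketch for the 128-bit form defined above:

    /* Sketch (-mkl): encrypt one 16-byte block through a previously created
     * handle.  Returns 0 on success, -1 if the handle was rejected (ZF set). */
    #include <immintrin.h>

    int encrypt_block(unsigned char out[16], const unsigned char in[16],
                      const __m128i handle[6])
    {
        __m128i block = _mm_loadu_si128((const __m128i *)in);
        __m128i cipher;

        if (_mm_aesenc128kl_u8(&cipher, block, handle))
            return -1;                       /* illegal or tampered handle */

        _mm_storeu_si128((__m128i *)out, cipher);
        return 0;
    }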
+/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[255:128] || +/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 ) +/// IF (IllegalHandle) +/// ZF := 1 +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey) +/// IF (Authentic == 0) +/// ZF := 1 +/// ELSE +/// MEM[__odata+127:__odata] := AES256Encrypt (__idata[127:0], UnwrappedKey) +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) { + return __builtin_ia32_aesenc256kl_u8((__v2di *)__odata, (__v2di)__idata, __h); +} + +/// The AESDEC128KL performs 10 rounds of AES to decrypt the __idata using +/// the 128-bit key in the handle from the __h. It stores the result in the +/// __odata. And return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESDEC128KL instructions. +/// +/// \operation +/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic. +/// IllegalHandle := (HandleReservedBitSet (Handle[383:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[383:256] || +/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128) +/// IF (IllegalHandle) +/// ZF := 1 +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey) +/// IF (Authentic == 0) +/// ZF := 1 +/// ELSE +/// MEM[__odata+127:__odata] := AES128Decrypt (__idata[127:0], UnwrappedKey) +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) { + return __builtin_ia32_aesdec128kl_u8((__v2di *)__odata, (__v2di)__idata, __h); +} + +/// The AESDEC256KL performs 10 rounds of AES to decrypt the __idata using +/// the 256-bit key in the handle from the __h. It stores the result in the +/// __odata. And return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESDEC256KL instructions. +/// +/// \operation +/// Handle[511:0] := MEM[__h+511:__h] +/// IllegalHandle := (HandleReservedBitSet (Handle[511:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[383:256] || +/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256) +/// IF (IllegalHandle) +/// ZF := 1 +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey) +/// IF (Authentic == 0) +/// ZF := 1 +/// ELSE +/// MEM[__odata+127:__odata] := AES256Decrypt (__idata[127:0], UnwrappedKey) +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) { + return __builtin_ia32_aesdec256kl_u8((__v2di *)__odata, (__v2di)__idata, __h); +} + +#undef __DEFAULT_FN_ATTRS + +#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \ + || defined(__KL__) */ + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__WIDEKL__) + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("kl,widekl"),\ + __min_vector_width__(128))) + +/// Encrypt __idata[0] to __idata[7] using 128-bit AES key indicated by handle +/// at __h and store each resultant block back from __odata to __odata+7. And +/// return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESENCWIDE128KL instructions. +/// +/// \operation +/// Handle := MEM[__h+383:__h] +/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[255:128] || +/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 ) +/// IF (IllegalHandle) +/// ZF := 1 +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey) +/// IF Authentic == 0 +/// ZF := 1 +/// ELSE +/// FOR i := 0 to 7 +/// __odata[i] := AES128Encrypt (__idata[i], UnwrappedKey) +/// ENDFOR +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) { + return __builtin_ia32_aesencwide128kl_u8((__v2di *)__odata, + (const __v2di *)__idata, __h); +} + +/// Encrypt __idata[0] to __idata[7] using 256-bit AES key indicated by handle +/// at __h and store each resultant block back from __odata to __odata+7. And +/// return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESENCWIDE256KL instructions. +/// +/// \operation +/// Handle[511:0] := MEM[__h+511:__h] +/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[255:128] || +/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES512 ) +/// IF (IllegalHandle) +/// ZF := 1 +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey) +/// IF Authentic == 0 +/// ZF := 1 +/// ELSE +/// FOR i := 0 to 7 +/// __odata[i] := AES256Encrypt (__idata[i], UnwrappedKey) +/// ENDFOR +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) { + return __builtin_ia32_aesencwide256kl_u8((__v2di *)__odata, + (const __v2di *)__idata, __h); +} + +/// Decrypt __idata[0] to __idata[7] using 128-bit AES key indicated by handle +/// at __h and store each resultant block back from __odata to __odata+7. And +/// return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESDECWIDE128KL instructions. 
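The WIDEKL forms process eight independent blocks per invocation against a single handle, which is what an ECB- or CTR-style loop would build on; the return value is again the ZF status. A sketch for the 128-bit wide encrypt defined above, assuming -mkl -mwidekl:

    /* Sketch (-mkl -mwidekl): encrypt eight independent blocks against one
     * AES-128 handle.  A non-zero return means the handle failed validation
     * and no output was produced. */
    #include <immintrin.h>

    int encrypt_8_blocks(__m128i out[8], const __m128i in[8],
                         const __m128i handle[6])
    {
        return _mm_aesencwide128kl_u8(out, in, handle) ? -1 : 0;
    }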
+/// +/// \operation +/// Handle[383:0] := MEM[__h+383:__h] +/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[255:128] || +/// HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES128 ) +/// IF (IllegalHandle) +/// ZF := 1 +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey) +/// IF Authentic == 0 +/// ZF := 1 +/// ELSE +/// FOR i := 0 to 7 +/// __odata[i] := AES128Decrypt (__idata[i], UnwrappedKey) +/// ENDFOR +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) { + return __builtin_ia32_aesdecwide128kl_u8((__v2di *)__odata, + (const __v2di *)__idata, __h); +} + +/// Decrypt __idata[0] to __idata[7] using 256-bit AES key indicated by handle +/// at __h and store each resultant block back from __odata to __odata+7. And +/// return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESDECWIDE256KL instructions. +/// +/// \operation +/// Handle[511:0] := MEM[__h+511:__h] +/// IllegalHandle = ( HandleReservedBitSet (Handle[511:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[255:128] || +/// HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES512 ) +/// If (IllegalHandle) +/// ZF := 1 +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey) +/// IF Authentic == 0 +/// ZF := 1 +/// ELSE +/// FOR i := 0 to 7 +/// __odata[i] := AES256Decrypt (__idata[i], UnwrappedKey) +/// ENDFOR +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesdecwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) { + return __builtin_ia32_aesdecwide256kl_u8((__v2di *)__odata, + (const __v2di *)__idata, __h); +} + +#undef __DEFAULT_FN_ATTRS + +#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \ + || defined(__WIDEKL__) */ + +#endif /* _KEYLOCKERINTRIN_H */ diff --git a/lib/include/mm_malloc.h b/lib/include/mm_malloc.h index 0ea32517ae..933dbaacad 100644 --- a/lib/include/mm_malloc.h +++ b/lib/include/mm_malloc.h @@ -54,7 +54,13 @@ _mm_malloc(size_t __size, size_t __align) static __inline__ void __attribute__((__always_inline__, __nodebug__)) _mm_free(void *__p) { +#if defined(__MINGW32__) + __mingw_aligned_free(__p); +#elif defined(_WIN32) + _aligned_free(__p); +#else free(__p); +#endif } #endif diff --git a/lib/include/opencl-c-base.h b/lib/include/opencl-c-base.h index 430e07d36f..e8dcd70377 100644 --- a/lib/include/opencl-c-base.h +++ b/lib/include/opencl-c-base.h @@ -9,6 +9,21 @@ #ifndef _OPENCL_BASE_H_ #define _OPENCL_BASE_H_ +// Define extension macros + +#if (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ >= 200) +// For SPIR all extensions are supported. 
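Stepping back to the mm_malloc.h hunk above: _mm_free now mirrors the allocation path on Windows targets instead of calling plain free on a pointer that did not come from malloc. A sketch of the usual pairing:

    /* Sketch: the usual _mm_malloc/_mm_free pairing.  With the change above,
     * the free side matches the allocation side on Windows targets
     * (_aligned_free on MSVC environments, __mingw_aligned_free on MinGW). */
    #include <stddef.h>
    #include <mm_malloc.h>

    float *alloc_vec(size_t n)
    {
        return (float *)_mm_malloc(n * sizeof(float), 64);  /* 64-byte aligned */
    }

    void free_vec(float *p)
    {
        _mm_free(p);   /* releases memory obtained from _mm_malloc */
    }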
+#if defined(__SPIR__) +#define cl_khr_subgroup_extended_types 1 +#define cl_khr_subgroup_non_uniform_vote 1 +#define cl_khr_subgroup_ballot 1 +#define cl_khr_subgroup_non_uniform_arithmetic 1 +#define cl_khr_subgroup_shuffle 1 +#define cl_khr_subgroup_shuffle_relative 1 +#define cl_khr_subgroup_clustered_reduce 1 +#endif // defined(__SPIR__) +#endif // (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ >= 200) + // built-in scalar data types: /** @@ -568,4 +583,7 @@ typedef struct { #pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : end #endif // cl_intel_device_side_avc_motion_estimation +// Disable any extensions we may have enabled previously. +#pragma OPENCL EXTENSION all : disable + #endif //_OPENCL_BASE_H_ diff --git a/lib/include/opencl-c.h b/lib/include/opencl-c.h index 66e18bdd47..ab665628c8 100644 --- a/lib/include/opencl-c.h +++ b/lib/include/opencl-c.h @@ -4633,6 +4633,7 @@ float16 __ovld __cnfn convert_float16(float16); // Conversions with double data type parameters or return value. #ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable char __ovld __cnfn convert_char(double); char __ovld __cnfn convert_char_rte(double); char __ovld __cnfn convert_char_rtn(double); @@ -5455,6 +5456,7 @@ double16 __ovld __cnfn convert_double16_rtz(ushort16); #endif //cl_khr_fp64 #ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable // Convert half types to non-double types. uchar __ovld __cnfn convert_uchar(half); uchar __ovld __cnfn convert_uchar_rte(half); diff --git a/lib/include/openmp_wrappers/cmath b/lib/include/openmp_wrappers/cmath index bd6011eb6f..1aff66af7d 100644 --- a/lib/include/openmp_wrappers/cmath +++ b/lib/include/openmp_wrappers/cmath @@ -24,8 +24,11 @@ // which might live in cstdlib. #include +// We need limits because __clang_cuda_cmath.h below uses `std::numeric_limit`. +#include + #pragma omp begin declare variant match( \ - device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) + device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any, allow_templates)}) #define __CUDA__ #define __OPENMP_NVPTX__ diff --git a/lib/include/openmp_wrappers/complex b/lib/include/openmp_wrappers/complex index d8dcd41670..142e526b81 100644 --- a/lib/include/openmp_wrappers/complex +++ b/lib/include/openmp_wrappers/complex @@ -25,3 +25,28 @@ // Grab the host header too. #include_next + + +#ifdef __cplusplus + +// If we are compiling against libc++, the macro _LIBCPP_STD_VER should be set +// after including above. Since the complex header we use is a +// simplified version of the libc++, we don't need it in this case. If we +// compile against libstdc++, or any other standard library, we will overload +// the (hopefully template) functions in the header with the ones we +// got from libc++ which decomposes math functions, like `std::sin`, into +// arithmetic and calls to non-complex functions, all of which we can then +// handle. 
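+// Illustrative note (editorial addition, not part of the upstream header): with
+// this wrapper active and a standard library other than libc++, a target region
+// such as
+//   #pragma omp target
+//   { std::complex<double> z(1.0, 2.0); auto w = std::sin(z); }
+// resolves std::sin to the decomposed overload pulled in from <complex_cmath.h>
+// when compiling for nvptx/nvptx64, instead of the host library's version.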
+#ifndef _LIBCPP_STD_VER + +#pragma omp begin declare variant match( \ + device = {arch(nvptx, nvptx64)}, \ + implementation = {extension(match_any, allow_templates)}) + +#include + +#pragma omp end declare variant + +#endif + +#endif diff --git a/lib/include/openmp_wrappers/complex_cmath.h b/lib/include/openmp_wrappers/complex_cmath.h new file mode 100644 index 0000000000..e3d9aebbbc --- /dev/null +++ b/lib/include/openmp_wrappers/complex_cmath.h @@ -0,0 +1,388 @@ +//===------------------------- __complex_cmath.h --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// std::complex header copied from the libcxx source and simplified for use in +// OpenMP target offload regions. +// +//===----------------------------------------------------------------------===// + +#ifndef _OPENMP +#error "This file is for OpenMP compilation only." +#endif + +#ifndef __cplusplus +#error "This file is for C++ compilation only." +#endif + +#ifndef _LIBCPP_COMPLEX +#define _LIBCPP_COMPLEX + +#include +#include + +#define __DEVICE__ static constexpr __attribute__((nothrow)) + +namespace std { + +// abs + +template __DEVICE__ _Tp abs(const std::complex<_Tp> &__c) { + return hypot(__c.real(), __c.imag()); +} + +// arg + +template __DEVICE__ _Tp arg(const std::complex<_Tp> &__c) { + return atan2(__c.imag(), __c.real()); +} + +template +typename enable_if::value || is_same<_Tp, double>::value, + double>::type +arg(_Tp __re) { + return atan2(0., __re); +} + +template +typename enable_if::value, float>::type arg(_Tp __re) { + return atan2f(0.F, __re); +} + +// norm + +template __DEVICE__ _Tp norm(const std::complex<_Tp> &__c) { + if (std::isinf(__c.real())) + return abs(__c.real()); + if (std::isinf(__c.imag())) + return abs(__c.imag()); + return __c.real() * __c.real() + __c.imag() * __c.imag(); +} + +// conj + +template std::complex<_Tp> conj(const std::complex<_Tp> &__c) { + return std::complex<_Tp>(__c.real(), -__c.imag()); +} + +// proj + +template std::complex<_Tp> proj(const std::complex<_Tp> &__c) { + std::complex<_Tp> __r = __c; + if (std::isinf(__c.real()) || std::isinf(__c.imag())) + __r = std::complex<_Tp>(INFINITY, copysign(_Tp(0), __c.imag())); + return __r; +} + +// polar + +template +complex<_Tp> polar(const _Tp &__rho, const _Tp &__theta = _Tp()) { + if (std::isnan(__rho) || signbit(__rho)) + return std::complex<_Tp>(_Tp(NAN), _Tp(NAN)); + if (std::isnan(__theta)) { + if (std::isinf(__rho)) + return std::complex<_Tp>(__rho, __theta); + return std::complex<_Tp>(__theta, __theta); + } + if (std::isinf(__theta)) { + if (std::isinf(__rho)) + return std::complex<_Tp>(__rho, _Tp(NAN)); + return std::complex<_Tp>(_Tp(NAN), _Tp(NAN)); + } + _Tp __x = __rho * cos(__theta); + if (std::isnan(__x)) + __x = 0; + _Tp __y = __rho * sin(__theta); + if (std::isnan(__y)) + __y = 0; + return std::complex<_Tp>(__x, __y); +} + +// log + +template std::complex<_Tp> log(const std::complex<_Tp> &__x) { + return std::complex<_Tp>(log(abs(__x)), arg(__x)); +} + +// log10 + +template std::complex<_Tp> log10(const std::complex<_Tp> &__x) { + return log(__x) / log(_Tp(10)); +} + +// sqrt + +template +__DEVICE__ std::complex<_Tp> sqrt(const std::complex<_Tp> &__x) { + if (std::isinf(__x.imag())) + return std::complex<_Tp>(_Tp(INFINITY), __x.imag()); + if 
(std::isinf(__x.real())) { + if (__x.real() > _Tp(0)) + return std::complex<_Tp>(__x.real(), std::isnan(__x.imag()) + ? __x.imag() + : copysign(_Tp(0), __x.imag())); + return std::complex<_Tp>(std::isnan(__x.imag()) ? __x.imag() : _Tp(0), + copysign(__x.real(), __x.imag())); + } + return polar(sqrt(abs(__x)), arg(__x) / _Tp(2)); +} + +// exp + +template +__DEVICE__ std::complex<_Tp> exp(const std::complex<_Tp> &__x) { + _Tp __i = __x.imag(); + if (std::isinf(__x.real())) { + if (__x.real() < _Tp(0)) { + if (!std::isfinite(__i)) + __i = _Tp(1); + } else if (__i == 0 || !std::isfinite(__i)) { + if (std::isinf(__i)) + __i = _Tp(NAN); + return std::complex<_Tp>(__x.real(), __i); + } + } else if (std::isnan(__x.real()) && __x.imag() == 0) + return __x; + _Tp __e = exp(__x.real()); + return std::complex<_Tp>(__e * cos(__i), __e * sin(__i)); +} + +// pow + +template +std::complex<_Tp> pow(const std::complex<_Tp> &__x, + const std::complex<_Tp> &__y) { + return exp(__y * log(__x)); +} + +// __sqr, computes pow(x, 2) + +template std::complex<_Tp> __sqr(const std::complex<_Tp> &__x) { + return std::complex<_Tp>((__x.real() - __x.imag()) * + (__x.real() + __x.imag()), + _Tp(2) * __x.real() * __x.imag()); +} + +// asinh + +template +__DEVICE__ std::complex<_Tp> asinh(const std::complex<_Tp> &__x) { + const _Tp __pi(atan2(+0., -0.)); + if (std::isinf(__x.real())) { + if (std::isnan(__x.imag())) + return __x; + if (std::isinf(__x.imag())) + return std::complex<_Tp>(__x.real(), + copysign(__pi * _Tp(0.25), __x.imag())); + return std::complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag())); + } + if (std::isnan(__x.real())) { + if (std::isinf(__x.imag())) + return std::complex<_Tp>(__x.imag(), __x.real()); + if (__x.imag() == 0) + return __x; + return std::complex<_Tp>(__x.real(), __x.real()); + } + if (std::isinf(__x.imag())) + return std::complex<_Tp>(copysign(__x.imag(), __x.real()), + copysign(__pi / _Tp(2), __x.imag())); + std::complex<_Tp> __z = log(__x + sqrt(__sqr(__x) + _Tp(1))); + return std::complex<_Tp>(copysign(__z.real(), __x.real()), + copysign(__z.imag(), __x.imag())); +} + +// acosh + +template +__DEVICE__ std::complex<_Tp> acosh(const std::complex<_Tp> &__x) { + const _Tp __pi(atan2(+0., -0.)); + if (std::isinf(__x.real())) { + if (std::isnan(__x.imag())) + return std::complex<_Tp>(abs(__x.real()), __x.imag()); + if (std::isinf(__x.imag())) { + if (__x.real() > 0) + return std::complex<_Tp>(__x.real(), + copysign(__pi * _Tp(0.25), __x.imag())); + else + return std::complex<_Tp>(-__x.real(), + copysign(__pi * _Tp(0.75), __x.imag())); + } + if (__x.real() < 0) + return std::complex<_Tp>(-__x.real(), copysign(__pi, __x.imag())); + return std::complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag())); + } + if (std::isnan(__x.real())) { + if (std::isinf(__x.imag())) + return std::complex<_Tp>(abs(__x.imag()), __x.real()); + return std::complex<_Tp>(__x.real(), __x.real()); + } + if (std::isinf(__x.imag())) + return std::complex<_Tp>(abs(__x.imag()), + copysign(__pi / _Tp(2), __x.imag())); + std::complex<_Tp> __z = log(__x + sqrt(__sqr(__x) - _Tp(1))); + return std::complex<_Tp>(copysign(__z.real(), _Tp(0)), + copysign(__z.imag(), __x.imag())); +} + +// atanh + +template +__DEVICE__ std::complex<_Tp> atanh(const std::complex<_Tp> &__x) { + const _Tp __pi(atan2(+0., -0.)); + if (std::isinf(__x.imag())) { + return std::complex<_Tp>(copysign(_Tp(0), __x.real()), + copysign(__pi / _Tp(2), __x.imag())); + } + if (std::isnan(__x.imag())) { + if (std::isinf(__x.real()) || __x.real() == 0) + return 
std::complex<_Tp>(copysign(_Tp(0), __x.real()), __x.imag()); + return std::complex<_Tp>(__x.imag(), __x.imag()); + } + if (std::isnan(__x.real())) { + return std::complex<_Tp>(__x.real(), __x.real()); + } + if (std::isinf(__x.real())) { + return std::complex<_Tp>(copysign(_Tp(0), __x.real()), + copysign(__pi / _Tp(2), __x.imag())); + } + if (abs(__x.real()) == _Tp(1) && __x.imag() == _Tp(0)) { + return std::complex<_Tp>(copysign(_Tp(INFINITY), __x.real()), + copysign(_Tp(0), __x.imag())); + } + std::complex<_Tp> __z = log((_Tp(1) + __x) / (_Tp(1) - __x)) / _Tp(2); + return std::complex<_Tp>(copysign(__z.real(), __x.real()), + copysign(__z.imag(), __x.imag())); +} + +// sinh + +template +__DEVICE__ std::complex<_Tp> sinh(const std::complex<_Tp> &__x) { + if (std::isinf(__x.real()) && !std::isfinite(__x.imag())) + return std::complex<_Tp>(__x.real(), _Tp(NAN)); + if (__x.real() == 0 && !std::isfinite(__x.imag())) + return std::complex<_Tp>(__x.real(), _Tp(NAN)); + if (__x.imag() == 0 && !std::isfinite(__x.real())) + return __x; + return std::complex<_Tp>(sinh(__x.real()) * cos(__x.imag()), + cosh(__x.real()) * sin(__x.imag())); +} + +// cosh + +template +__DEVICE__ std::complex<_Tp> cosh(const std::complex<_Tp> &__x) { + if (std::isinf(__x.real()) && !std::isfinite(__x.imag())) + return std::complex<_Tp>(abs(__x.real()), _Tp(NAN)); + if (__x.real() == 0 && !std::isfinite(__x.imag())) + return std::complex<_Tp>(_Tp(NAN), __x.real()); + if (__x.real() == 0 && __x.imag() == 0) + return std::complex<_Tp>(_Tp(1), __x.imag()); + if (__x.imag() == 0 && !std::isfinite(__x.real())) + return std::complex<_Tp>(abs(__x.real()), __x.imag()); + return std::complex<_Tp>(cosh(__x.real()) * cos(__x.imag()), + sinh(__x.real()) * sin(__x.imag())); +} + +// tanh + +template +__DEVICE__ std::complex<_Tp> tanh(const std::complex<_Tp> &__x) { + if (std::isinf(__x.real())) { + if (!std::isfinite(__x.imag())) + return std::complex<_Tp>(_Tp(1), _Tp(0)); + return std::complex<_Tp>(_Tp(1), + copysign(_Tp(0), sin(_Tp(2) * __x.imag()))); + } + if (std::isnan(__x.real()) && __x.imag() == 0) + return __x; + _Tp __2r(_Tp(2) * __x.real()); + _Tp __2i(_Tp(2) * __x.imag()); + _Tp __d(cosh(__2r) + cos(__2i)); + _Tp __2rsh(sinh(__2r)); + if (std::isinf(__2rsh) && std::isinf(__d)) + return std::complex<_Tp>(__2rsh > _Tp(0) ? _Tp(1) : _Tp(-1), + __2i > _Tp(0) ? _Tp(0) : _Tp(-0.)); + return std::complex<_Tp>(__2rsh / __d, sin(__2i) / __d); +} + +// asin + +template +__DEVICE__ std::complex<_Tp> asin(const std::complex<_Tp> &__x) { + std::complex<_Tp> __z = asinh(complex<_Tp>(-__x.imag(), __x.real())); + return std::complex<_Tp>(__z.imag(), -__z.real()); +} + +// acos + +template +__DEVICE__ std::complex<_Tp> acos(const std::complex<_Tp> &__x) { + const _Tp __pi(atan2(+0., -0.)); + if (std::isinf(__x.real())) { + if (std::isnan(__x.imag())) + return std::complex<_Tp>(__x.imag(), __x.real()); + if (std::isinf(__x.imag())) { + if (__x.real() < _Tp(0)) + return std::complex<_Tp>(_Tp(0.75) * __pi, -__x.imag()); + return std::complex<_Tp>(_Tp(0.25) * __pi, -__x.imag()); + } + if (__x.real() < _Tp(0)) + return std::complex<_Tp>(__pi, + signbit(__x.imag()) ? -__x.real() : __x.real()); + return std::complex<_Tp>(_Tp(0), + signbit(__x.imag()) ? 
__x.real() : -__x.real()); + } + if (std::isnan(__x.real())) { + if (std::isinf(__x.imag())) + return std::complex<_Tp>(__x.real(), -__x.imag()); + return std::complex<_Tp>(__x.real(), __x.real()); + } + if (std::isinf(__x.imag())) + return std::complex<_Tp>(__pi / _Tp(2), -__x.imag()); + if (__x.real() == 0 && (__x.imag() == 0 || isnan(__x.imag()))) + return std::complex<_Tp>(__pi / _Tp(2), -__x.imag()); + std::complex<_Tp> __z = log(__x + sqrt(__sqr(__x) - _Tp(1))); + if (signbit(__x.imag())) + return std::complex<_Tp>(abs(__z.imag()), abs(__z.real())); + return std::complex<_Tp>(abs(__z.imag()), -abs(__z.real())); +} + +// atan + +template +__DEVICE__ std::complex<_Tp> atan(const std::complex<_Tp> &__x) { + std::complex<_Tp> __z = atanh(complex<_Tp>(-__x.imag(), __x.real())); + return std::complex<_Tp>(__z.imag(), -__z.real()); +} + +// sin + +template +__DEVICE__ std::complex<_Tp> sin(const std::complex<_Tp> &__x) { + std::complex<_Tp> __z = sinh(complex<_Tp>(-__x.imag(), __x.real())); + return std::complex<_Tp>(__z.imag(), -__z.real()); +} + +// cos + +template std::complex<_Tp> cos(const std::complex<_Tp> &__x) { + return cosh(complex<_Tp>(-__x.imag(), __x.real())); +} + +// tan + +template +__DEVICE__ std::complex<_Tp> tan(const std::complex<_Tp> &__x) { + std::complex<_Tp> __z = tanh(complex<_Tp>(-__x.imag(), __x.real())); + return std::complex<_Tp>(__z.imag(), -__z.real()); +} + +} // namespace std + +#endif diff --git a/lib/include/popcntintrin.h b/lib/include/popcntintrin.h index 3129010147..0aa94aecda 100644 --- a/lib/include/popcntintrin.h +++ b/lib/include/popcntintrin.h @@ -13,6 +13,12 @@ /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt"))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#else +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#endif + /// Counts the number of bits in the source operand having a value of 1. /// /// \headerfile @@ -23,7 +29,7 @@ /// An unsigned 32-bit integer operand. /// \returns A 32-bit integer containing the number of bits with value 1 in the /// source operand. -static __inline__ int __DEFAULT_FN_ATTRS +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR _mm_popcnt_u32(unsigned int __A) { return __builtin_popcount(__A); @@ -40,7 +46,7 @@ _mm_popcnt_u32(unsigned int __A) /// An unsigned 64-bit integer operand. /// \returns A 64-bit integer containing the number of bits with value 1 in the /// source operand. 
-static __inline__ long long __DEFAULT_FN_ATTRS +static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR _mm_popcnt_u64(unsigned long long __A) { return __builtin_popcountll(__A); @@ -48,5 +54,6 @@ _mm_popcnt_u64(unsigned long long __A) #endif /* __x86_64__ */ #undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS_CONSTEXPR #endif /* __POPCNTINTRIN_H */ diff --git a/lib/include/ppc_wrappers/smmintrin.h b/lib/include/ppc_wrappers/smmintrin.h index 56ef6ba76b..64f0c76199 100644 --- a/lib/include/ppc_wrappers/smmintrin.h +++ b/lib/include/ppc_wrappers/smmintrin.h @@ -78,6 +78,30 @@ extern __inline __m128i return (__m128i)vec_sel((__v16qu)__A, (__v16qu)__B, __lmask); } +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_insert_epi8(__m128i const __A, int const __D, int const __N) { + __v16qi result = (__v16qi)__A; + result[__N & 0xf] = __D; + return (__m128i)result; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_insert_epi32(__m128i const __A, int const __D, int const __N) { + __v4si result = (__v4si)__A; + result[__N & 3] = __D; + return (__m128i)result; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) { + __v2di result = (__v2di)__A; + result[__N & 1] = __D; + return (__m128i)result; +} + #else #include_next #endif /* defined(__linux__) && defined(__ppc64__) */ diff --git a/lib/include/uintrintrin.h b/lib/include/uintrintrin.h new file mode 100644 index 0000000000..78aa8779c3 --- /dev/null +++ b/lib/include/uintrintrin.h @@ -0,0 +1,150 @@ +/*===------------------ uintrintrin.h - UINTR intrinsics -------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __X86GPRINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __UINTRINTRIN_H +#define __UINTRINTRIN_H + +/* Define the default attributes for the functions in this file */ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("uintr"))) + +#ifdef __x86_64__ + +/// Clears the user interrupt flag (UIF). Its effect takes place immediately: a +/// user interrupt cannot be delivered on the instruction boundary following +/// CLUI. Can be executed only if CR4.UINT = 1, the logical processor is in +/// 64-bit mode, and software is not executing inside an enclave; otherwise, +/// each causes an invalid-opcode exception. Causes a transactional abort if +/// executed inside a transactional region; the abort loads EAX as it would +/// had it been due to an execution of CLI. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the CLUI instruction. +/// +/// \operation +/// UIF := 0 +/// \endoperation +static __inline__ void __DEFAULT_FN_ATTRS +_clui (void) +{ + __builtin_ia32_clui(); +} + +/// Sets the user interrupt flag (UIF). Its effect takes place immediately; a +/// user interrupt may be delivered on the instruction boundary following +/// STUI. Can be executed only if CR4.UINT = 1, the logical processor is in +/// 64-bit mode, and software is not executing inside an enclave; otherwise, +/// each causes an invalid-opcode exception. 
Causes a transactional abort if +/// executed inside a transactional region; the abort loads EAX as it would +/// had it been due to an execution of STI. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the STUI instruction. +/// +/// \operation +/// UIF := 1 +/// \endoperation +static __inline__ void __DEFAULT_FN_ATTRS +_stui (void) +{ + __builtin_ia32_stui(); +} + +/// Get the current value of the user interrupt flag (UIF). Can be executed +/// regardless of CPL and inside a transactional region. Can be executed only +/// if CR4.UINT = 1, the logical processor is in 64-bit mode, and software is +/// not executing inside an enclave; otherwise, it causes an invalid-opcode +/// exception. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TESTUI instruction. +/// +/// \returns The current value of the user interrupt flag (UIF). +/// +/// \operation +/// CF := UIF +/// ZF := 0 +/// AF := 0 +/// OF := 0 +/// PF := 0 +/// SF := 0 +/// dst := CF +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_testui (void) +{ + return __builtin_ia32_testui(); +} + +/// Send interprocessor user interrupt. Can be executed only if +/// CR4.UINT = IA32_UINT_TT[0] = 1, the logical processor is in 64-bit mode, +/// and software is not executing inside an enclave; otherwise, it causes an +/// invalid-opcode exception. May be executed at any privilege level, all of +/// its memory accesses are performed with supervisor privilege. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the SENDUIPI instruction +/// +/// \param __a +/// Index of user-interrupt target table entry in user-interrupt target +/// table. +/// +/// \operation +/// IF __a > UITTSZ +/// GP (0) +/// FI +/// tempUITTE := MEM[UITTADDR + (a<<4)] +/// // tempUITTE must be valid, and can't have any reserved bit set +/// IF (tempUITTE.V == 0 OR tempUITTE[7:1] != 0) +/// GP (0) +/// FI +/// tempUPID := MEM[tempUITTE.UPIDADDR] // under lock +/// // tempUPID can't have any reserved bit set +/// IF (tempUPID[15:2] != 0 OR tempUPID[31:24] != 0) +/// GP (0) // release lock +/// FI +/// tempUPID.PIR[tempUITTE.UV] := 1; +/// IF (tempUPID.SN == 0 AND tempUPID.ON == 0) +/// tempUPID.ON := 1 +/// sendNotify := 1 +/// ELSE +/// sendNotify := 0 +/// FI +/// MEM[tempUITTE.UPIDADDR] := tempUPID // release lock +/// IF sendNotify == 1 +/// IF IA32_APIC_BASE[10] == 1 // local APIC is in x2APIC mode +/// // send ordinary IPI with vector tempUPID.NV to 32-bit physical APIC +/// // ID tempUPID.NDST +/// SendOrdinaryIPI(tempUPID.NV, tempUPID.NDST) +/// ELSE +/// // send ordinary IPI with vector tempUPID.NV to 8-bit physical APIC +/// // ID tempUPID.NDST[15:8] +/// SendOrdinaryIPI(tempUPID.NV, tempUPID.NDST[15:8]) +/// FI +/// FI +/// \endoperation +static __inline__ void __DEFAULT_FN_ATTRS +_senduipi (unsigned long long __a) +{ + __builtin_ia32_senduipi(__a); +} + +#endif /* __x86_64__ */ + +#undef __DEFAULT_FN_ATTRS + +#endif /* __UINTRINTRIN_H */ diff --git a/lib/include/wasm_simd128.h b/lib/include/wasm_simd128.h index b78123834b..ac88516ac9 100644 --- a/lib/include/wasm_simd128.h +++ b/lib/include/wasm_simd128.h @@ -18,8 +18,7 @@ typedef int32_t v128_t __attribute__((__vector_size__(16), __aligned__(16))); // Internal types determined by clang builtin definitions typedef int32_t __v128_u __attribute__((__vector_size__(16), __aligned__(1))); -typedef char __i8x16 __attribute__((__vector_size__(16), __aligned__(16))); -typedef signed char __s8x16 +typedef signed char __i8x16 __attribute__((__vector_size__(16), 
__aligned__(16))); typedef unsigned char __u8x16 __attribute__((__vector_size__(16), __aligned__(16))); @@ -35,6 +34,13 @@ typedef unsigned long long __u64x2 typedef float __f32x4 __attribute__((__vector_size__(16), __aligned__(16))); typedef double __f64x2 __attribute__((__vector_size__(16), __aligned__(16))); +typedef signed char __i8x8 __attribute__((__vector_size__(8), __aligned__(8))); +typedef unsigned char __u8x8 + __attribute__((__vector_size__(8), __aligned__(8))); +typedef short __i16x4 __attribute__((__vector_size__(8), __aligned__(8))); +typedef unsigned short __u16x4 + __attribute__((__vector_size__(8), __aligned__(8))); + #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("simd128"), \ __min_vector_width__(128))) @@ -273,7 +279,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_splat(int8_t __a) { (__builtin_wasm_extract_lane_s_i8x16((__i8x16)(__a), __i)) #define wasm_u8x16_extract_lane(__a, __i) \ - (__builtin_wasm_extract_lane_u_i8x16((__i8x16)(__a), __i)) + (__builtin_wasm_extract_lane_u_i8x16((__u8x16)(__a), __i)) #define wasm_i8x16_replace_lane(__a, __i, __b) \ ((v128_t)__builtin_wasm_replace_lane_i8x16((__i8x16)(__a), __i, __b)) @@ -286,7 +292,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_splat(int16_t __a) { (__builtin_wasm_extract_lane_s_i16x8((__i16x8)(__a), __i)) #define wasm_u16x8_extract_lane(__a, __i) \ - (__builtin_wasm_extract_lane_u_i16x8((__i16x8)(__a), __i)) + (__builtin_wasm_extract_lane_u_i16x8((__u16x8)(__a), __i)) #define wasm_i16x8_replace_lane(__a, __i, __b) \ ((v128_t)__builtin_wasm_replace_lane_i16x8((__i16x8)(__a), __i, __b)) @@ -333,17 +339,17 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_splat(double __a) { static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_eq(v128_t __a, v128_t __b) { - return (v128_t)((__s8x16)__a == (__s8x16)__b); + return (v128_t)((__i8x16)__a == (__i8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_ne(v128_t __a, v128_t __b) { - return (v128_t)((__s8x16)__a != (__s8x16)__b); + return (v128_t)((__i8x16)__a != (__i8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_lt(v128_t __a, v128_t __b) { - return (v128_t)((__s8x16)__a < (__s8x16)__b); + return (v128_t)((__i8x16)__a < (__i8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_lt(v128_t __a, @@ -353,7 +359,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_lt(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_gt(v128_t __a, v128_t __b) { - return (v128_t)((__s8x16)__a > (__s8x16)__b); + return (v128_t)((__i8x16)__a > (__i8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_gt(v128_t __a, @@ -363,7 +369,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_gt(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_le(v128_t __a, v128_t __b) { - return (v128_t)((__s8x16)__a <= (__s8x16)__b); + return (v128_t)((__i8x16)__a <= (__i8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_le(v128_t __a, @@ -373,7 +379,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_le(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_ge(v128_t __a, v128_t __b) { - return (v128_t)((__s8x16)__a >= (__s8x16)__b); + return (v128_t)((__i8x16)__a >= (__i8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_ge(v128_t __a, @@ -595,7 +601,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_shl(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS 
wasm_i8x16_shr(v128_t __a, int32_t __b) { - return (v128_t)((__s8x16)__a >> __b); + return (v128_t)((__i8x16)__a >> __b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_shr(v128_t __a, @@ -616,8 +622,8 @@ wasm_i8x16_add_saturate(v128_t __a, v128_t __b) { static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_add_saturate(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_add_saturate_u_i8x16((__i8x16)__a, - (__i8x16)__b); + return (v128_t)__builtin_wasm_add_saturate_u_i8x16((__u8x16)__a, + (__u8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_sub(v128_t __a, @@ -633,8 +639,8 @@ wasm_i8x16_sub_saturate(v128_t __a, v128_t __b) { static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_sub_saturate(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_sub_saturate_u_i8x16((__i8x16)__a, - (__i8x16)__b); + return (v128_t)__builtin_wasm_sub_saturate_u_i8x16((__u8x16)__a, + (__u8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_min(v128_t __a, @@ -644,7 +650,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_min(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_min(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_min_u_i8x16((__i8x16)__a, (__i8x16)__b); + return (v128_t)__builtin_wasm_min_u_i8x16((__u8x16)__a, (__u8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_max(v128_t __a, @@ -654,12 +660,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_max(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_max(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_max_u_i8x16((__i8x16)__a, (__i8x16)__b); + return (v128_t)__builtin_wasm_max_u_i8x16((__u8x16)__a, (__u8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_avgr(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_avgr_u_i8x16((__i8x16)__a, (__i8x16)__b); + return (v128_t)__builtin_wasm_avgr_u_i8x16((__u8x16)__a, (__u8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_abs(v128_t __a) { @@ -706,8 +712,8 @@ wasm_i16x8_add_saturate(v128_t __a, v128_t __b) { static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_add_saturate(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_add_saturate_u_i16x8((__i16x8)__a, - (__i16x8)__b); + return (v128_t)__builtin_wasm_add_saturate_u_i16x8((__u16x8)__a, + (__u16x8)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_sub(v128_t __a, @@ -723,8 +729,8 @@ wasm_i16x8_sub_saturate(v128_t __a, v128_t __b) { static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_sub_saturate(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_sub_saturate_u_i16x8((__i16x8)__a, - (__i16x8)__b); + return (v128_t)__builtin_wasm_sub_saturate_u_i16x8((__u16x8)__a, + (__u16x8)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_mul(v128_t __a, @@ -739,7 +745,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_min(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_min(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_min_u_i16x8((__i16x8)__a, (__i16x8)__b); + return (v128_t)__builtin_wasm_min_u_i16x8((__u16x8)__a, (__u16x8)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_max(v128_t __a, @@ -749,12 +755,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_max(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_max(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_max_u_i16x8((__i16x8)__a, (__i16x8)__b); + return (v128_t)__builtin_wasm_max_u_i16x8((__u16x8)__a, 
(__u16x8)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_avgr(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_avgr_u_i16x8((__i16x8)__a, (__i16x8)__b); + return (v128_t)__builtin_wasm_avgr_u_i16x8((__u16x8)__a, (__u16x8)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_abs(v128_t __a) { @@ -810,7 +816,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_min(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_min(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_min_u_i32x4((__i32x4)__a, (__i32x4)__b); + return (v128_t)__builtin_wasm_min_u_i32x4((__u32x4)__a, (__u32x4)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_max(v128_t __a, @@ -820,7 +826,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_max(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_max(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_max_u_i32x4((__i32x4)__a, (__i32x4)__b); + return (v128_t)__builtin_wasm_max_u_i32x4((__u32x4)__a, (__u32x4)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_neg(v128_t __a) { @@ -1071,8 +1077,8 @@ wasm_i8x16_narrow_i16x8(v128_t __a, v128_t __b) { static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_narrow_i16x8(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_narrow_u_i8x16_i16x8((__i16x8)__a, - (__i16x8)__b); + return (v128_t)__builtin_wasm_narrow_u_i8x16_i16x8((__u16x8)__a, + (__u16x8)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS @@ -1083,48 +1089,76 @@ wasm_i16x8_narrow_i32x4(v128_t __a, v128_t __b) { static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_narrow_i32x4(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_narrow_u_i16x8_i32x4((__i32x4)__a, - (__i32x4)__b); + return (v128_t)__builtin_wasm_narrow_u_i16x8_i32x4((__u32x4)__a, + (__u32x4)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_widen_low_i8x16(v128_t __a) { - return (v128_t)__builtin_wasm_widen_low_s_i16x8_i8x16((__i8x16)__a); + return (v128_t) __builtin_convertvector( + (__i8x8){((__i8x16)__a)[0], ((__i8x16)__a)[1], ((__i8x16)__a)[2], + ((__i8x16)__a)[3], ((__i8x16)__a)[4], ((__i8x16)__a)[5], + ((__i8x16)__a)[6], ((__i8x16)__a)[7]}, + __i16x8); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_widen_high_i8x16(v128_t __a) { - return (v128_t)__builtin_wasm_widen_high_s_i16x8_i8x16((__i8x16)__a); + return (v128_t) __builtin_convertvector( + (__i8x8){((__i8x16)__a)[8], ((__i8x16)__a)[9], ((__i8x16)__a)[10], + ((__i8x16)__a)[11], ((__i8x16)__a)[12], ((__i8x16)__a)[13], + ((__i8x16)__a)[14], ((__i8x16)__a)[15]}, + __i16x8); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_widen_low_u8x16(v128_t __a) { - return (v128_t)__builtin_wasm_widen_low_u_i16x8_i8x16((__i8x16)__a); + return (v128_t) __builtin_convertvector( + (__u8x8){((__u8x16)__a)[0], ((__u8x16)__a)[1], ((__u8x16)__a)[2], + ((__u8x16)__a)[3], ((__u8x16)__a)[4], ((__u8x16)__a)[5], + ((__u8x16)__a)[6], ((__u8x16)__a)[7]}, + __u16x8); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_widen_high_u8x16(v128_t __a) { - return (v128_t)__builtin_wasm_widen_high_u_i16x8_i8x16((__i8x16)__a); + return (v128_t) __builtin_convertvector( + (__u8x8){((__u8x16)__a)[8], ((__u8x16)__a)[9], ((__u8x16)__a)[10], + ((__u8x16)__a)[11], ((__u8x16)__a)[12], ((__u8x16)__a)[13], + ((__u8x16)__a)[14], ((__u8x16)__a)[15]}, + __u16x8); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_widen_low_i16x8(v128_t __a) { - return (v128_t)__builtin_wasm_widen_low_s_i32x4_i16x8((__i16x8)__a); + return (v128_t) 
__builtin_convertvector( + (__i16x4){((__i16x8)__a)[0], ((__i16x8)__a)[1], ((__i16x8)__a)[2], + ((__i16x8)__a)[3]}, + __i32x4); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_widen_high_i16x8(v128_t __a) { - return (v128_t)__builtin_wasm_widen_high_s_i32x4_i16x8((__i16x8)__a); + return (v128_t) __builtin_convertvector( + (__i16x4){((__i16x8)__a)[4], ((__i16x8)__a)[5], ((__i16x8)__a)[6], + ((__i16x8)__a)[7]}, + __i32x4); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_widen_low_u16x8(v128_t __a) { - return (v128_t)__builtin_wasm_widen_low_u_i32x4_i16x8((__i16x8)__a); + return (v128_t) __builtin_convertvector( + (__u16x4){((__u16x8)__a)[0], ((__u16x8)__a)[1], ((__u16x8)__a)[2], + ((__u16x8)__a)[3]}, + __u32x4); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_widen_high_u16x8(v128_t __a) { - return (v128_t)__builtin_wasm_widen_high_u_i32x4_i16x8((__i16x8)__a); + return (v128_t) __builtin_convertvector( + (__u16x4){((__u16x8)__a)[4], ((__u16x8)__a)[5], ((__u16x8)__a)[6], + ((__u16x8)__a)[7]}, + __u32x4); } // Undefine helper macros diff --git a/lib/include/x86gprintrin.h b/lib/include/x86gprintrin.h new file mode 100644 index 0000000000..1fc6cab4b2 --- /dev/null +++ b/lib/include/x86gprintrin.h @@ -0,0 +1,23 @@ +/*===--------------- x86gprintrin.h - X86 GPR intrinsics ------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __X86GPRINTRIN_H +#define __X86GPRINTRIN_H + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__HRESET__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__UINTR__) +#include +#endif + +#endif /* __X86GPRINTRIN_H */ diff --git a/lib/libcxx/include/__availability b/lib/libcxx/include/__availability new file mode 100644 index 0000000000..db2267c8eb --- /dev/null +++ b/lib/libcxx/include/__availability @@ -0,0 +1,206 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___AVAILABILITY +#define _LIBCPP___AVAILABILITY + +#include <__config> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +// Libc++ is shipped by various vendors. In particular, it is used as a system +// library on macOS, iOS and other Apple platforms. In order for users to be +// able to compile a binary that is intended to be deployed to an older version +// of a platform, Clang provides availability attributes [1]. These attributes +// can be placed on declarations and are used to describe the life cycle of a +// symbol in the library. +// +// The main goal is to ensure a compile-time error if a symbol that hasn't been +// introduced in a previously released library is used in a program that targets +// that previously released library. Normally, this would be a load-time error +// when one tries to launch the program against the older library. 
+// +// For example, the filesystem library was introduced in the dylib in macOS 10.15. +// If a user compiles on a macOS 10.15 host but targets macOS 10.13 with their +// program, the compiler would normally not complain (because the required +// declarations are in the headers), but the dynamic loader would fail to find +// the symbols when actually trying to launch the program on macOS 10.13. To +// turn this into a compile-time issue instead, declarations are annotated with +// when they were introduced, and the compiler can produce a diagnostic if the +// program references something that isn't available on the deployment target. +// +// This mechanism is general in nature, and any vendor can add their markup to +// the library (see below). Whenever a new feature is added that requires support +// in the shared library, a macro should be added below to mark this feature +// as unavailable. When vendors decide to ship the feature as part of their +// shared library, they can update the markup appropriately. +// +// Note that this mechanism is disabled by default in the "upstream" libc++. +// Availability annotations are only meaningful when shipping libc++ inside +// a platform (i.e. as a system library), and so vendors that want them should +// turn those annotations on at CMake configuration time. +// +// [1]: https://clang.llvm.org/docs/AttributeReference.html#availability + + +// For backwards compatibility, allow users to define _LIBCPP_DISABLE_AVAILABILITY +// for a while. +#if defined(_LIBCPP_DISABLE_AVAILABILITY) +# if !defined(_LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS) +# define _LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS +# endif +#endif + +// Availability markup is disabled when building the library, or when the compiler +// doesn't support the proper attributes. +#if defined(_LIBCPP_BUILDING_LIBRARY) || \ + defined(_LIBCXXABI_BUILDING_LIBRARY) || \ + !__has_feature(attribute_availability_with_strict) || \ + !__has_feature(attribute_availability_in_templates) || \ + !__has_extension(pragma_clang_attribute_external_declaration) +# if !defined(_LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS) +# define _LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS +# endif +#endif + +#if defined(_LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS) + + // This controls the availability of std::shared_mutex and std::shared_timed_mutex, + // which were added to the dylib later. +# define _LIBCPP_AVAILABILITY_SHARED_MUTEX + + // These macros control the availability of std::bad_optional_access and + // other exception types. These were put in the shared library to prevent + // code bloat from every user program defining the vtable for these exception + // types. +# define _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS +# define _LIBCPP_AVAILABILITY_BAD_VARIANT_ACCESS +# define _LIBCPP_AVAILABILITY_BAD_ANY_CAST + + // This controls the availability of std::uncaught_exceptions(). +# define _LIBCPP_AVAILABILITY_UNCAUGHT_EXCEPTIONS + + // This controls the availability of the sized version of ::operator delete, + // which was added to the dylib later. +# define _LIBCPP_AVAILABILITY_SIZED_NEW_DELETE + + // This controls the availability of the std::future_error exception. +# define _LIBCPP_AVAILABILITY_FUTURE_ERROR + + // This controls the availability of std::type_info's vtable. + // I can't imagine how using std::type_info can work at all if + // this isn't supported. +# define _LIBCPP_AVAILABILITY_TYPEINFO_VTABLE + + // This controls the availability of std::locale::category members + // (e.g. 
std::locale::collate), which are defined in the dylib. +# define _LIBCPP_AVAILABILITY_LOCALE_CATEGORY + + // This controls the availability of atomic operations on std::shared_ptr + // (e.g. `std::atomic_store(std::shared_ptr)`), which require a shared + // lock table located in the dylib. +# define _LIBCPP_AVAILABILITY_ATOMIC_SHARED_PTR + + // These macros control the availability of all parts of that + // depend on something in the dylib. +# define _LIBCPP_AVAILABILITY_FILESYSTEM +# define _LIBCPP_AVAILABILITY_FILESYSTEM_PUSH +# define _LIBCPP_AVAILABILITY_FILESYSTEM_POP + + // This controls the availability of std::to_chars. +# define _LIBCPP_AVAILABILITY_TO_CHARS + + // This controls the availability of the C++20 synchronization library, + // which requires shared library support for various operations + // (see libcxx/src/atomic.cpp). +# define _LIBCPP_AVAILABILITY_SYNC + +#elif defined(__APPLE__) + +# define _LIBCPP_AVAILABILITY_SHARED_MUTEX \ + __attribute__((availability(macosx,strict,introduced=10.12))) \ + __attribute__((availability(ios,strict,introduced=10.0))) \ + __attribute__((availability(tvos,strict,introduced=10.0))) \ + __attribute__((availability(watchos,strict,introduced=3.0))) +# define _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS \ + __attribute__((availability(macosx,strict,introduced=10.13))) \ + __attribute__((availability(ios,strict,introduced=11.0))) \ + __attribute__((availability(tvos,strict,introduced=11.0))) \ + __attribute__((availability(watchos,strict,introduced=4.0))) +# define _LIBCPP_AVAILABILITY_BAD_VARIANT_ACCESS \ + _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS +# define _LIBCPP_AVAILABILITY_BAD_ANY_CAST \ + _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS +# define _LIBCPP_AVAILABILITY_UNCAUGHT_EXCEPTIONS \ + __attribute__((availability(macosx,strict,introduced=10.12))) \ + __attribute__((availability(ios,strict,introduced=10.0))) \ + __attribute__((availability(tvos,strict,introduced=10.0))) \ + __attribute__((availability(watchos,strict,introduced=3.0))) +# define _LIBCPP_AVAILABILITY_SIZED_NEW_DELETE \ + __attribute__((availability(macosx,strict,introduced=10.12))) \ + __attribute__((availability(ios,strict,introduced=10.0))) \ + __attribute__((availability(tvos,strict,introduced=10.0))) \ + __attribute__((availability(watchos,strict,introduced=3.0))) +# define _LIBCPP_AVAILABILITY_FUTURE_ERROR \ + __attribute__((availability(ios,strict,introduced=6.0))) +# define _LIBCPP_AVAILABILITY_TYPEINFO_VTABLE \ + __attribute__((availability(macosx,strict,introduced=10.9))) \ + __attribute__((availability(ios,strict,introduced=7.0))) +# define _LIBCPP_AVAILABILITY_LOCALE_CATEGORY \ + __attribute__((availability(macosx,strict,introduced=10.9))) \ + __attribute__((availability(ios,strict,introduced=7.0))) +# define _LIBCPP_AVAILABILITY_ATOMIC_SHARED_PTR \ + __attribute__((availability(macosx,strict,introduced=10.9))) \ + __attribute__((availability(ios,strict,introduced=7.0))) +# define _LIBCPP_AVAILABILITY_FILESYSTEM \ + __attribute__((availability(macosx,strict,introduced=10.15))) \ + __attribute__((availability(ios,strict,introduced=13.0))) \ + __attribute__((availability(tvos,strict,introduced=13.0))) \ + __attribute__((availability(watchos,strict,introduced=6.0))) +# define _LIBCPP_AVAILABILITY_FILESYSTEM_PUSH \ + _Pragma("clang attribute push(__attribute__((availability(macosx,strict,introduced=10.15))), apply_to=any(function,record))") \ + _Pragma("clang attribute push(__attribute__((availability(ios,strict,introduced=13.0))), apply_to=any(function,record))") \ + 
_Pragma("clang attribute push(__attribute__((availability(tvos,strict,introduced=13.0))), apply_to=any(function,record))") \ + _Pragma("clang attribute push(__attribute__((availability(watchos,strict,introduced=6.0))), apply_to=any(function,record))") +# define _LIBCPP_AVAILABILITY_FILESYSTEM_POP \ + _Pragma("clang attribute pop") \ + _Pragma("clang attribute pop") \ + _Pragma("clang attribute pop") \ + _Pragma("clang attribute pop") +# define _LIBCPP_AVAILABILITY_TO_CHARS \ + _LIBCPP_AVAILABILITY_FILESYSTEM +# define _LIBCPP_AVAILABILITY_SYNC \ + __attribute__((unavailable)) + +#else + +// ...New vendors can add availability markup here... + +# error "It looks like you're trying to enable vendor availability markup, but you haven't defined the corresponding macros yet!" + +#endif + +// Define availability attributes that depend on _LIBCPP_NO_EXCEPTIONS. +// Those are defined in terms of the availability attributes above, and +// should not be vendor-specific. +#if defined(_LIBCPP_NO_EXCEPTIONS) +# define _LIBCPP_AVAILABILITY_FUTURE +# define _LIBCPP_AVAILABILITY_THROW_BAD_ANY_CAST +# define _LIBCPP_AVAILABILITY_THROW_BAD_OPTIONAL_ACCESS +# define _LIBCPP_AVAILABILITY_THROW_BAD_VARIANT_ACCESS +#else +# define _LIBCPP_AVAILABILITY_FUTURE _LIBCPP_AVAILABILITY_FUTURE_ERROR +# define _LIBCPP_AVAILABILITY_THROW_BAD_ANY_CAST _LIBCPP_AVAILABILITY_BAD_ANY_CAST +# define _LIBCPP_AVAILABILITY_THROW_BAD_OPTIONAL_ACCESS _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS +# define _LIBCPP_AVAILABILITY_THROW_BAD_VARIANT_ACCESS _LIBCPP_AVAILABILITY_BAD_VARIANT_ACCESS +#endif + +#endif // _LIBCPP___AVAILABILITY diff --git a/lib/libcxx/include/__bit_reference b/lib/libcxx/include/__bit_reference index 4a2b82064b..9cfb4b84e6 100644 --- a/lib/libcxx/include/__bit_reference +++ b/lib/libcxx/include/__bit_reference @@ -11,7 +11,7 @@ #define _LIBCPP___BIT_REFERENCE #include <__config> -#include +#include <__bits> #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -239,8 +239,8 @@ __bit_iterator<_Cp, _IsConst> find(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, const _Tp& __value_) { if (static_cast(__value_)) - return __find_bool_true(__first, static_cast(__last - __first)); - return __find_bool_false(__first, static_cast(__last - __first)); + return _VSTD::__find_bool_true(__first, static_cast(__last - __first)); + return _VSTD::__find_bool_false(__first, static_cast(__last - __first)); } // count @@ -313,8 +313,8 @@ typename __bit_iterator<_Cp, _IsConst>::difference_type count(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, const _Tp& __value_) { if (static_cast(__value_)) - return __count_bool_true(__first, static_cast(__last - __first)); - return __count_bool_false(__first, static_cast(__last - __first)); + return _VSTD::__count_bool_true(__first, static_cast(__last - __first)); + return _VSTD::__count_bool_false(__first, static_cast(__last - __first)); } // fill_n @@ -387,9 +387,9 @@ fill_n(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n, bool __v if (__n > 0) { if (__value_) - __fill_n_true(__first, __n); + _VSTD::__fill_n_true(__first, __n); else - __fill_n_false(__first, __n); + _VSTD::__fill_n_false(__first, __n); } } @@ -538,8 +538,8 @@ __bit_iterator<_Cp, false> copy(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { if (__first.__ctz_ == __result.__ctz_) - return __copy_aligned(__first, __last, __result); - return __copy_unaligned(__first, __last, __result); + 
return _VSTD::__copy_aligned(__first, __last, __result); + return _VSTD::__copy_unaligned(__first, __last, __result); } // copy_backward @@ -685,8 +685,8 @@ __bit_iterator<_Cp, false> copy_backward(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { if (__last.__ctz_ == __result.__ctz_) - return __copy_backward_aligned(__first, __last, __result); - return __copy_backward_unaligned(__first, __last, __result); + return _VSTD::__copy_backward_aligned(__first, __last, __result); + return _VSTD::__copy_backward_unaligned(__first, __last, __result); } // move @@ -868,8 +868,8 @@ swap_ranges(__bit_iterator<__C1, false> __first1, __bit_iterator<__C1, false> __ __bit_iterator<__C2, false> __first2) { if (__first1.__ctz_ == __first2.__ctz_) - return __swap_ranges_aligned(__first1, __last1, __first2); - return __swap_ranges_unaligned(__first1, __last1, __first2); + return _VSTD::__swap_ranges_aligned(__first1, __last1, __first2); + return _VSTD::__swap_ranges_unaligned(__first1, __last1, __first2); } // rotate @@ -1083,8 +1083,8 @@ bool equal(__bit_iterator<_Cp, _IC1> __first1, __bit_iterator<_Cp, _IC1> __last1, __bit_iterator<_Cp, _IC2> __first2) { if (__first1.__ctz_ == __first2.__ctz_) - return __equal_aligned(__first1, __last1, __first2); - return __equal_unaligned(__first1, __last1, __first2); + return _VSTD::__equal_aligned(__first1, __last1, __first2); + return _VSTD::__equal_unaligned(__first1, __last1, __first2); } template + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +#pragma GCC system_header +#endif + +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + + +_LIBCPP_BEGIN_NAMESPACE_STD + +#ifndef _LIBCPP_COMPILER_MSVC + +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR +int __libcpp_ctz(unsigned __x) _NOEXCEPT { return __builtin_ctz(__x); } + +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR +int __libcpp_ctz(unsigned long __x) _NOEXCEPT { return __builtin_ctzl(__x); } + +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR +int __libcpp_ctz(unsigned long long __x) _NOEXCEPT { return __builtin_ctzll(__x); } + + +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR +int __libcpp_clz(unsigned __x) _NOEXCEPT { return __builtin_clz(__x); } + +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR +int __libcpp_clz(unsigned long __x) _NOEXCEPT { return __builtin_clzl(__x); } + +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR +int __libcpp_clz(unsigned long long __x) _NOEXCEPT { return __builtin_clzll(__x); } + + +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR +int __libcpp_popcount(unsigned __x) _NOEXCEPT { return __builtin_popcount(__x); } + +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR +int __libcpp_popcount(unsigned long __x) _NOEXCEPT { return __builtin_popcountl(__x); } + +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR +int __libcpp_popcount(unsigned long long __x) _NOEXCEPT { return __builtin_popcountll(__x); } + +#else // _LIBCPP_COMPILER_MSVC + +// Precondition: __x != 0 +inline _LIBCPP_INLINE_VISIBILITY +int __libcpp_ctz(unsigned __x) { + static_assert(sizeof(unsigned) == sizeof(unsigned long), ""); + static_assert(sizeof(unsigned long) == 4, ""); + unsigned long __where; + if (_BitScanForward(&__where, __x)) + return static_cast(__where); + return 32; +} + +inline _LIBCPP_INLINE_VISIBILITY +int __libcpp_ctz(unsigned long __x) { + static_assert(sizeof(unsigned long) == sizeof(unsigned), ""); + return __ctz(static_cast(__x)); +} + +inline _LIBCPP_INLINE_VISIBILITY +int __libcpp_ctz(unsigned long long __x) { 
+ unsigned long __where; +#if defined(_LIBCPP_HAS_BITSCAN64) + (defined(_M_AMD64) || defined(__x86_64__)) + if (_BitScanForward64(&__where, __x)) + return static_cast(__where); +#else + // Win32 doesn't have _BitScanForward64 so emulate it with two 32 bit calls. + if (_BitScanForward(&__where, static_cast(__x))) + return static_cast(__where); + if (_BitScanForward(&__where, static_cast(__x >> 32))) + return static_cast(__where + 32); +#endif + return 64; +} + +// Precondition: __x != 0 +inline _LIBCPP_INLINE_VISIBILITY +int __libcpp_clz(unsigned __x) { + static_assert(sizeof(unsigned) == sizeof(unsigned long), ""); + static_assert(sizeof(unsigned long) == 4, ""); + unsigned long __where; + if (_BitScanReverse(&__where, __x)) + return static_cast(31 - __where); + return 32; // Undefined Behavior. +} + +inline _LIBCPP_INLINE_VISIBILITY +int __libcpp_clz(unsigned long __x) { + static_assert(sizeof(unsigned) == sizeof(unsigned long), ""); + return __libcpp_clz(static_cast(__x)); +} + +inline _LIBCPP_INLINE_VISIBILITY +int __libcpp_clz(unsigned long long __x) { + unsigned long __where; +#if defined(_LIBCPP_HAS_BITSCAN64) + if (_BitScanReverse64(&__where, __x)) + return static_cast(63 - __where); +#else + // Win32 doesn't have _BitScanReverse64 so emulate it with two 32 bit calls. + if (_BitScanReverse(&__where, static_cast(__x >> 32))) + return static_cast(63 - (__where + 32)); + if (_BitScanReverse(&__where, static_cast(__x))) + return static_cast(63 - __where); +#endif + return 64; // Undefined Behavior. +} + +inline _LIBCPP_INLINE_VISIBILITY int __libcpp_popcount(unsigned __x) { + static_assert(sizeof(unsigned) == 4, ""); + return __popcnt(__x); +} + +inline _LIBCPP_INLINE_VISIBILITY int __libcpp_popcount(unsigned long __x) { + static_assert(sizeof(unsigned long) == 4, ""); + return __popcnt(__x); +} + +inline _LIBCPP_INLINE_VISIBILITY int __libcpp_popcount(unsigned long long __x) { + static_assert(sizeof(unsigned long long) == 8, ""); + return __popcnt64(__x); +} + +#endif // _LIBCPP_COMPILER_MSVC + +_LIBCPP_END_NAMESPACE_STD + +_LIBCPP_POP_MACROS + +#endif // _LIBCPP__BITS diff --git a/lib/libcxx/include/__config b/lib/libcxx/include/__config index 575147cead..a3838c89e8 100644 --- a/lib/libcxx/include/__config +++ b/lib/libcxx/include/__config @@ -32,13 +32,13 @@ # define _GNUC_VER_NEW 0 #endif -#define _LIBCPP_VERSION 11000 +#define _LIBCPP_VERSION 12000 #ifndef _LIBCPP_ABI_VERSION # define _LIBCPP_ABI_VERSION 1 #endif -#ifndef __STDC_HOSTED__ +#if __STDC_HOSTED__ == 0 # define _LIBCPP_FREESTANDING #endif @@ -49,8 +49,10 @@ # define _LIBCPP_STD_VER 14 # elif __cplusplus <= 201703L # define _LIBCPP_STD_VER 17 +# elif __cplusplus <= 202002L +# define _LIBCPP_STD_VER 20 # else -# define _LIBCPP_STD_VER 18 // current year, or date of c++2a ratification +# define _LIBCPP_STD_VER 21 // current year, or date of c++2b ratification # endif #endif // _LIBCPP_STD_VER @@ -63,7 +65,7 @@ #elif defined(__wasm__) # define _LIBCPP_OBJECT_FORMAT_WASM 1 #else -# error Unknown object file format + // ... add new file formats here ... #endif #if defined(_LIBCPP_ABI_UNSTABLE) || _LIBCPP_ABI_VERSION >= 2 @@ -105,6 +107,10 @@ // Re-worked external template instantiations for std::string with a focus on // performance and fast-path inlining. # define _LIBCPP_ABI_STRING_OPTIMIZED_EXTERNAL_INSTANTIATION +// Enable clang::trivial_abi on std::unique_ptr. 
+# define _LIBCPP_ABI_ENABLE_UNIQUE_PTR_TRIVIAL_ABI +// Enable clang::trivial_abi on std::shared_ptr and std::weak_ptr +# define _LIBCPP_ABI_ENABLE_SHARED_PTR_TRIVIAL_ABI #elif _LIBCPP_ABI_VERSION == 1 # if !defined(_LIBCPP_OBJECT_FORMAT_COFF) // Enable compiling copies of now inline methods into the dylib to support @@ -121,9 +127,11 @@ # endif #endif -#ifdef _LIBCPP_TRIVIAL_PAIR_COPY_CTOR -#error "_LIBCPP_TRIVIAL_PAIR_COPY_CTOR" is no longer supported. \ - use _LIBCPP_DEPRECATED_ABI_DISABLE_PAIR_TRIVIAL_COPY_CTOR instead +#if defined(_LIBCPP_BUILDING_LIBRARY) || defined(_LIBCPP_ABI_UNSTABLE) || _LIBCPP_ABI_VERSION >= 2 +// Enable additional explicit instantiations of iostreams components. This +// reduces the number of weak definitions generated in programs that use +// iostreams by providing a single strong definition in the shared library. +# define _LIBCPP_ABI_ENABLE_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1 #endif #define _LIBCPP_CONCAT1(_LIBCPP_X,_LIBCPP_Y) _LIBCPP_X##_LIBCPP_Y @@ -256,14 +264,14 @@ # endif // __LONG_LONG_SUPPORTED #endif // __FreeBSD__ -#ifdef __NetBSD__ +#if defined(__NetBSD__) || defined(__OpenBSD__) # include # if _BYTE_ORDER == _LITTLE_ENDIAN # define _LIBCPP_LITTLE_ENDIAN # else // _BYTE_ORDER == _LITTLE_ENDIAN # define _LIBCPP_BIG_ENDIAN # endif // _BYTE_ORDER == _LITTLE_ENDIAN -#endif // __NetBSD__ +#endif // defined(__NetBSD__) || defined(__OpenBSD__) #if defined(_WIN32) # define _LIBCPP_WIN32API @@ -304,7 +312,7 @@ # endif #endif // __sun__ -#if defined(__CloudABI__) +#if defined(__OpenBSD__) || defined(__CloudABI__) // Certain architectures provide arc4random(). Prefer using // arc4random() over /dev/{u,}random to make it possible to obtain // random data even when using sandboxing mechanisms such as chroots, @@ -344,13 +352,11 @@ # if defined(__FreeBSD__) # define _LIBCPP_HAS_ALIGNED_ALLOC # define _LIBCPP_HAS_QUICK_EXIT -# define _LIBCPP_HAS_C11_FEATURES # if __FreeBSD_version >= 1300064 || \ (__FreeBSD_version >= 1201504 && __FreeBSD_version < 1300000) # define _LIBCPP_HAS_TIMESPEC_GET # endif # elif defined(__BIONIC__) -# define _LIBCPP_HAS_C11_FEATURES # if __ANDROID_API__ >= 21 # define _LIBCPP_HAS_QUICK_EXIT # endif @@ -364,7 +370,9 @@ # define _LIBCPP_HAS_ALIGNED_ALLOC # define _LIBCPP_HAS_QUICK_EXIT # define _LIBCPP_HAS_TIMESPEC_GET -# define _LIBCPP_HAS_C11_FEATURES +# elif defined(__OpenBSD__) +# define _LIBCPP_HAS_ALIGNED_ALLOC +# define _LIBCPP_HAS_TIMESPEC_GET # elif defined(__linux__) # if !defined(_LIBCPP_HAS_MUSL_LIBC) # if _LIBCPP_GLIBC_PREREQ(2, 15) || defined(__BIONIC__) @@ -372,16 +380,24 @@ # endif # if _LIBCPP_GLIBC_PREREQ(2, 17) # define _LIBCPP_HAS_ALIGNED_ALLOC -# define _LIBCPP_HAS_C11_FEATURES # define _LIBCPP_HAS_TIMESPEC_GET # endif # else // defined(_LIBCPP_HAS_MUSL_LIBC) # define _LIBCPP_HAS_ALIGNED_ALLOC # define _LIBCPP_HAS_QUICK_EXIT # define _LIBCPP_HAS_TIMESPEC_GET -# define _LIBCPP_HAS_C11_FEATURES # endif -# endif // __linux__ +# elif defined(__APPLE__) + // timespec_get and aligned_alloc were introduced in macOS 10.15 and + // aligned releases +# if (__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ >= 101500 || \ + __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ >= 130000 || \ + __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ >= 130000 || \ + __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ >= 60000) +# define _LIBCPP_HAS_ALIGNED_ALLOC +# define _LIBCPP_HAS_TIMESPEC_GET +# endif +# endif // __APPLE__ #endif #ifndef _LIBCPP_CXX03_LANG @@ -389,9 +405,7 @@ #elif defined(_LIBCPP_COMPILER_CLANG) # define _LIBCPP_ALIGNOF(_Tp) 
_Alignof(_Tp) #else -// This definition is potentially buggy, but it's only taken with GCC in C++03, -// which we barely support anyway. See llvm.org/PR39713 -# define _LIBCPP_ALIGNOF(_Tp) __alignof(_Tp) +# error "We don't know a correct way to implement alignof(T) in C++03 outside of Clang" #endif #define _LIBCPP_PREFERRED_ALIGNOF(_Tp) __alignof(_Tp) @@ -433,10 +447,6 @@ typedef __char32_t char32_t; # define _LIBCPP_NORETURN __attribute__ ((noreturn)) #endif -#if !(__has_feature(cxx_lambdas)) -#define _LIBCPP_HAS_NO_LAMBDAS -#endif - #if !(__has_feature(cxx_nullptr)) # if (__has_extension(cxx_nullptr) || __has_keyword(__nullptr)) && defined(_LIBCPP_ABI_ALWAYS_USE_CXX11_NULLPTR) # define nullptr __nullptr @@ -445,18 +455,6 @@ typedef __char32_t char32_t; # endif #endif -#if !(__has_feature(cxx_rvalue_references)) -#define _LIBCPP_HAS_NO_RVALUE_REFERENCES -#endif - -#if !(__has_feature(cxx_auto_type)) -#define _LIBCPP_HAS_NO_AUTO_TYPE -#endif - -#if !(__has_feature(cxx_variadic_templates)) -#define _LIBCPP_HAS_NO_VARIADICS -#endif - // Objective-C++ features (opt-in) #if __has_feature(objc_arc) #define _LIBCPP_HAS_OBJC_ARC @@ -720,7 +718,7 @@ typedef __char32_t char32_t; #endif #ifndef _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS -# if !defined(_LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS) && __has_attribute(__type_visibility__) +# if !defined(_LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS) # define _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS __attribute__ ((__visibility__("default"))) # else # define _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS @@ -754,16 +752,6 @@ typedef __char32_t char32_t; # endif #endif -#ifndef _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION -# ifdef _LIBCPP_OBJECT_FORMAT_COFF // Windows binaries can't merge typeinfos. -# define _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION 2 -# else - // TODO: This isn't strictly correct on ELF platforms due to llvm.org/PR37398 - // And we should consider defaulting to OFF. -# define _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION 1 -# endif -#endif - #ifndef _LIBCPP_HIDE_FROM_ABI # if _LIBCPP_HIDE_FROM_ABI_PER_TU # define _LIBCPP_HIDE_FROM_ABI _LIBCPP_HIDDEN _LIBCPP_INTERNAL_LINKAGE @@ -838,6 +826,12 @@ typedef unsigned int char32_t; # define _LIBCPP_CONSTEXPR constexpr #endif +#ifndef __cpp_consteval +# define _LIBCPP_CONSTEVAL _LIBCPP_CONSTEXPR +#else +# define _LIBCPP_CONSTEVAL consteval +#endif + #ifdef _LIBCPP_CXX03_LANG # define _LIBCPP_DEFAULT {} #else @@ -863,10 +857,6 @@ typedef unsigned int char32_t; # define _LIBCPP_EXPLICIT #endif -#if !__has_builtin(__builtin_operator_new) || !__has_builtin(__builtin_operator_delete) -#define _LIBCPP_HAS_NO_BUILTIN_OPERATOR_NEW_DELETE -#endif - #ifdef _LIBCPP_HAS_NO_STRONG_ENUMS # define _LIBCPP_DECLARE_STRONG_ENUM(x) struct _LIBCPP_TYPE_VIS x { enum __lx # define _LIBCPP_DECLARE_STRONG_ENUM_EPILOG(x) \ @@ -880,34 +870,43 @@ typedef unsigned int char32_t; # define _LIBCPP_DECLARE_STRONG_ENUM_EPILOG(x) #endif // _LIBCPP_HAS_NO_STRONG_ENUMS -#ifdef _LIBCPP_DEBUG -# if _LIBCPP_DEBUG == 0 -# define _LIBCPP_DEBUG_LEVEL 1 -# elif _LIBCPP_DEBUG == 1 -# define _LIBCPP_DEBUG_LEVEL 2 -# else -# error Supported values for _LIBCPP_DEBUG are 0 and 1 -# endif -# if !defined(_LIBCPP_BUILDING_LIBRARY) -# define _LIBCPP_EXTERN_TEMPLATE(...) -# endif +// _LIBCPP_DEBUG potential values: +// - undefined: No assertions. This is the default. +// - 0: Basic assertions +// - 1: Basic assertions + iterator validity checks. 
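The comment above documents the public _LIBCPP_DEBUG knob, and the hunk that follows maps it onto an internal _LIBCPP_DEBUG_LEVEL of 0, 1, or 2. A small usage sketch (not part of the patch; the exact diagnostic text depends on the libc++ build):

// Compile with: clang++ -std=c++17 -stdlib=libc++ -D_LIBCPP_DEBUG=0 demo.cpp
#include <vector>

int main() {
    std::vector<int> v{1, 2, 3};
    // With basic assertions enabled (_LIBCPP_DEBUG=0 -> _LIBCPP_DEBUG_LEVEL 1),
    // the out-of-bounds access below trips a _LIBCPP_ASSERT and aborts with a
    // message instead of being silent undefined behavior.
    return v[3];
}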
+#if !defined(_LIBCPP_DEBUG) +# define _LIBCPP_DEBUG_LEVEL 0 +#elif _LIBCPP_DEBUG == 0 +# define _LIBCPP_DEBUG_LEVEL 1 +#elif _LIBCPP_DEBUG == 1 +# define _LIBCPP_DEBUG_LEVEL 2 +#else +# error Supported values for _LIBCPP_DEBUG are 0 and 1 #endif -#ifndef _LIBCPP_DEBUG_LEVEL -# define _LIBCPP_DEBUG_LEVEL 0 +// _LIBCPP_DEBUG_LEVEL is always defined to one of [0, 1, 2] at this point +#if _LIBCPP_DEBUG_LEVEL >= 1 && !defined(_LIBCPP_DISABLE_EXTERN_TEMPLATE) +# define _LIBCPP_EXTERN_TEMPLATE(...) #endif #ifdef _LIBCPP_DISABLE_EXTERN_TEMPLATE -#define _LIBCPP_EXTERN_TEMPLATE(...) -#define _LIBCPP_EXTERN_TEMPLATE2(...) +# define _LIBCPP_EXTERN_TEMPLATE(...) +# define _LIBCPP_EXTERN_TEMPLATE_EVEN_IN_DEBUG_MODE(...) #endif #ifndef _LIBCPP_EXTERN_TEMPLATE #define _LIBCPP_EXTERN_TEMPLATE(...) extern template __VA_ARGS__; #endif -#ifndef _LIBCPP_EXTERN_TEMPLATE2 -#define _LIBCPP_EXTERN_TEMPLATE2(...) extern template __VA_ARGS__; +// When the Debug mode is enabled, we disable extern declarations because we +// don't want to use the functions compiled in the library, which might not +// have had the debug mode enabled when built. However, some extern declarations +// need to be used, because code correctness depends on it (several instances +// in the ). Those special declarations are declared with +// _LIBCPP_EXTERN_TEMPLATE_EVEN_IN_DEBUG_MODE, which is enabled even +// when the debug mode is enabled. +#ifndef _LIBCPP_EXTERN_TEMPLATE_EVEN_IN_DEBUG_MODE +# define _LIBCPP_EXTERN_TEMPLATE_EVEN_IN_DEBUG_MODE(...) extern template __VA_ARGS__; #endif #ifndef _LIBCPP_EXTERN_TEMPLATE_DEFINE @@ -938,6 +937,8 @@ typedef unsigned int char32_t; // We're deferring to Microsoft's STL to provide aligned new et al. We don't // have it unless the language feature test macro is defined. # define _LIBCPP_HAS_NO_LIBRARY_ALIGNED_ALLOCATION +#elif defined(__MVS__) +# define _LIBCPP_HAS_NO_LIBRARY_ALIGNED_ALLOCATION #endif #if defined(__APPLE__) @@ -999,6 +1000,18 @@ typedef unsigned int char32_t; # define _LIBCPP_DEPRECATED_IN_CXX17 #endif +#if _LIBCPP_STD_VER > 17 +# define _LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_DEPRECATED +#else +# define _LIBCPP_DEPRECATED_IN_CXX20 +#endif + +#if !defined(_LIBCPP_NO_HAS_CHAR8_T) +# define _LIBCPP_DEPRECATED_WITH_CHAR8_T _LIBCPP_DEPRECATED +#else +# define _LIBCPP_DEPRECATED_WITH_CHAR8_T +#endif + // Macros to enter and leave a state where deprecation warnings are suppressed. #if !defined(_LIBCPP_SUPPRESS_DEPRECATED_PUSH) && \ (defined(_LIBCPP_COMPILER_CLANG) || defined(_LIBCPP_COMPILER_GCC)) @@ -1037,14 +1050,6 @@ typedef unsigned int char32_t; # define _LIBCPP_CONSTEXPR_AFTER_CXX17 #endif -#if _LIBCPP_STD_VER > 17 && \ - !defined(_LIBCPP_HAS_NO_CXX14_CONSTEXPR) && \ - !defined(_LIBCPP_HAS_NO_BUILTIN_IS_CONSTANT_EVALUATED) -# define _LIBCPP_CONSTEXPR_AFTER_CXX17_WITH_IS_CONSTANT_EVALUATED constexpr -#else -# define _LIBCPP_CONSTEXPR_AFTER_CXX17_WITH_IS_CONSTANT_EVALUATED -#endif - // The _LIBCPP_NODISCARD_ATTRIBUTE should only be used to define other // NODISCARD macros to the correct attribute. 
#if __has_cpp_attribute(nodiscard) || defined(_LIBCPP_COMPILER_MSVC) @@ -1079,12 +1084,6 @@ typedef unsigned int char32_t; # define _LIBCPP_INLINE_VAR #endif -#ifdef _LIBCPP_HAS_NO_RVALUE_REFERENCES -# define _LIBCPP_EXPLICIT_MOVE(x) _VSTD::move(x) -#else -# define _LIBCPP_EXPLICIT_MOVE(x) (x) -#endif - #ifndef _LIBCPP_CONSTEXPR_IF_NODEBUG #if defined(_LIBCPP_DEBUG) || defined(_LIBCPP_HAS_NO_CXX14_CONSTEXPR) #define _LIBCPP_CONSTEXPR_IF_NODEBUG @@ -1100,7 +1099,7 @@ typedef unsigned int char32_t; #endif #ifndef _LIBCPP_HAS_NO_ASAN -_LIBCPP_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container( +extern "C" _LIBCPP_FUNC_VIS void __sanitizer_annotate_contiguous_container( const void *, const void *, const void *, const void *); #endif @@ -1125,11 +1124,14 @@ _LIBCPP_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container( # if defined(__FreeBSD__) || \ defined(__wasi__) || \ defined(__NetBSD__) || \ + defined(__OpenBSD__) || \ + defined(__NuttX__) || \ defined(__linux__) || \ defined(__GNU__) || \ defined(__APPLE__) || \ defined(__CloudABI__) || \ defined(__sun__) || \ + defined(__MVS__) || \ (defined(__MINGW32__) && __has_include()) # define _LIBCPP_HAS_THREAD_API_PTHREAD # elif defined(__Fuchsia__) @@ -1167,10 +1169,6 @@ _LIBCPP_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container( _LIBCPP_HAS_NO_THREADS is defined. #endif -#if defined(__STDCPP_THREADS__) && defined(_LIBCPP_HAS_NO_THREADS) -#error _LIBCPP_HAS_NO_THREADS cannot be set when __STDCPP_THREADS__ is set. -#endif - #if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(__STDCPP_THREADS__) #define __STDCPP_THREADS__ 1 #endif @@ -1222,13 +1220,15 @@ _LIBCPP_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container( // Some systems do not provide gets() in their C library, for security reasons. #ifndef _LIBCPP_C_HAS_NO_GETS # if defined(_LIBCPP_MSVCRT) || \ - (defined(__FreeBSD_version) && __FreeBSD_version >= 1300043) + (defined(__FreeBSD_version) && __FreeBSD_version >= 1300043) || \ + defined(__OpenBSD__) # define _LIBCPP_C_HAS_NO_GETS # endif #endif -#if defined(__BIONIC__) || defined(__CloudABI__) || \ - defined(__Fuchsia__) || defined(__wasi__) || defined(_LIBCPP_HAS_MUSL_LIBC) +#if defined(__BIONIC__) || defined(__CloudABI__) || defined(__NuttX__) || \ + defined(__Fuchsia__) || defined(__wasi__) || defined(_LIBCPP_HAS_MUSL_LIBC) || \ + defined(__MVS__) || defined(__OpenBSD__) #define _LIBCPP_PROVIDES_DEFAULT_RUNE_TABLE #endif @@ -1337,6 +1337,12 @@ _LIBCPP_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container( #endif #endif // !defined(_LIBCPP_NODEBUG_TYPE) +#if __has_attribute(__preferred_name__) +#define _LIBCPP_PREFERRED_NAME(x) __attribute__((__preferred_name__(x))) +#else +#define _LIBCPP_PREFERRED_NAME(x) +#endif + #if defined(_LIBCPP_ABI_MICROSOFT) && \ (defined(_LIBCPP_COMPILER_MSVC) || __has_declspec_attribute(empty_bases)) # define _LIBCPP_DECLSPEC_EMPTY_BASES __declspec(empty_bases) @@ -1367,120 +1373,6 @@ _LIBCPP_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container( #define _LIBCPP_HAS_NO_SPACESHIP_OPERATOR #endif -// Decide whether to use availability macros. 
-#if !defined(_LIBCPP_BUILDING_LIBRARY) && \ - !defined(_LIBCXXABI_BUILDING_LIBRARY) && \ - !defined(_LIBCPP_DISABLE_AVAILABILITY) && \ - __has_feature(attribute_availability_with_strict) && \ - __has_feature(attribute_availability_in_templates) && \ - __has_extension(pragma_clang_attribute_external_declaration) -# ifdef __APPLE__ -# define _LIBCPP_USE_AVAILABILITY_APPLE -# endif -#endif - -// Define availability macros. -#if defined(_LIBCPP_USE_AVAILABILITY_APPLE) -# define _LIBCPP_AVAILABILITY_SHARED_MUTEX \ - __attribute__((availability(macosx,strict,introduced=10.12))) \ - __attribute__((availability(ios,strict,introduced=10.0))) \ - __attribute__((availability(tvos,strict,introduced=10.0))) \ - __attribute__((availability(watchos,strict,introduced=3.0))) -# define _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS \ - __attribute__((availability(macosx,strict,introduced=10.13))) \ - __attribute__((availability(ios,strict,introduced=11.0))) \ - __attribute__((availability(tvos,strict,introduced=11.0))) \ - __attribute__((availability(watchos,strict,introduced=4.0))) -# define _LIBCPP_AVAILABILITY_BAD_VARIANT_ACCESS \ - _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS -# define _LIBCPP_AVAILABILITY_BAD_ANY_CAST \ - _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS -# define _LIBCPP_AVAILABILITY_UNCAUGHT_EXCEPTIONS \ - __attribute__((availability(macosx,strict,introduced=10.12))) \ - __attribute__((availability(ios,strict,introduced=10.0))) \ - __attribute__((availability(tvos,strict,introduced=10.0))) \ - __attribute__((availability(watchos,strict,introduced=3.0))) -# define _LIBCPP_AVAILABILITY_SIZED_NEW_DELETE \ - __attribute__((availability(macosx,strict,introduced=10.12))) \ - __attribute__((availability(ios,strict,introduced=10.0))) \ - __attribute__((availability(tvos,strict,introduced=10.0))) \ - __attribute__((availability(watchos,strict,introduced=3.0))) -# define _LIBCPP_AVAILABILITY_FUTURE_ERROR \ - __attribute__((availability(ios,strict,introduced=6.0))) -# define _LIBCPP_AVAILABILITY_TYPEINFO_VTABLE \ - __attribute__((availability(macosx,strict,introduced=10.9))) \ - __attribute__((availability(ios,strict,introduced=7.0))) -# define _LIBCPP_AVAILABILITY_LOCALE_CATEGORY \ - __attribute__((availability(macosx,strict,introduced=10.9))) \ - __attribute__((availability(ios,strict,introduced=7.0))) -# define _LIBCPP_AVAILABILITY_ATOMIC_SHARED_PTR \ - __attribute__((availability(macosx,strict,introduced=10.9))) \ - __attribute__((availability(ios,strict,introduced=7.0))) -# define _LIBCPP_AVAILABILITY_FILESYSTEM \ - __attribute__((availability(macosx,strict,introduced=10.15))) \ - __attribute__((availability(ios,strict,introduced=13.0))) \ - __attribute__((availability(tvos,strict,introduced=13.0))) \ - __attribute__((availability(watchos,strict,introduced=6.0))) -# define _LIBCPP_AVAILABILITY_FILESYSTEM_PUSH \ - _Pragma("clang attribute push(__attribute__((availability(macosx,strict,introduced=10.15))), apply_to=any(function,record))") \ - _Pragma("clang attribute push(__attribute__((availability(ios,strict,introduced=13.0))), apply_to=any(function,record))") \ - _Pragma("clang attribute push(__attribute__((availability(tvos,strict,introduced=13.0))), apply_to=any(function,record))") \ - _Pragma("clang attribute push(__attribute__((availability(watchos,strict,introduced=6.0))), apply_to=any(function,record))") -# define _LIBCPP_AVAILABILITY_FILESYSTEM_POP \ - _Pragma("clang attribute pop") \ - _Pragma("clang attribute pop") \ - _Pragma("clang attribute pop") \ - _Pragma("clang attribute pop") -# define 
_LIBCPP_AVAILABILITY_TO_CHARS \ - _LIBCPP_AVAILABILITY_FILESYSTEM -# define _LIBCPP_AVAILABILITY_SYNC \ - __attribute__((unavailable)) -#else -# define _LIBCPP_AVAILABILITY_SHARED_MUTEX -# define _LIBCPP_AVAILABILITY_BAD_VARIANT_ACCESS -# define _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS -# define _LIBCPP_AVAILABILITY_BAD_ANY_CAST -# define _LIBCPP_AVAILABILITY_UNCAUGHT_EXCEPTIONS -# define _LIBCPP_AVAILABILITY_SIZED_NEW_DELETE -# define _LIBCPP_AVAILABILITY_FUTURE_ERROR -# define _LIBCPP_AVAILABILITY_TYPEINFO_VTABLE -# define _LIBCPP_AVAILABILITY_LOCALE_CATEGORY -# define _LIBCPP_AVAILABILITY_ATOMIC_SHARED_PTR -# define _LIBCPP_AVAILABILITY_FILESYSTEM -# define _LIBCPP_AVAILABILITY_FILESYSTEM_PUSH -# define _LIBCPP_AVAILABILITY_FILESYSTEM_POP -# define _LIBCPP_AVAILABILITY_TO_CHARS -# define _LIBCPP_AVAILABILITY_SYNC -#endif - -// Define availability that depends on _LIBCPP_NO_EXCEPTIONS. -#ifdef _LIBCPP_NO_EXCEPTIONS -# define _LIBCPP_AVAILABILITY_FUTURE -# define _LIBCPP_AVAILABILITY_THROW_BAD_ANY_CAST -# define _LIBCPP_AVAILABILITY_THROW_BAD_OPTIONAL_ACCESS -# define _LIBCPP_AVAILABILITY_THROW_BAD_VARIANT_ACCESS -#else -# define _LIBCPP_AVAILABILITY_FUTURE _LIBCPP_AVAILABILITY_FUTURE_ERROR -# define _LIBCPP_AVAILABILITY_THROW_BAD_ANY_CAST _LIBCPP_AVAILABILITY_BAD_ANY_CAST -# define _LIBCPP_AVAILABILITY_THROW_BAD_OPTIONAL_ACCESS _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS -# define _LIBCPP_AVAILABILITY_THROW_BAD_VARIANT_ACCESS _LIBCPP_AVAILABILITY_BAD_VARIANT_ACCESS -#endif - -// The stream API was dropped and re-added in the dylib shipped on macOS -// and iOS. We can only assume the dylib to provide these definitions for -// macosx >= 10.9 and ios >= 7.0. Otherwise, the definitions are available -// from the headers, but not from the dylib. Explicit instantiation -// declarations for streams exist conditionally to this; if we provide -// an explicit instantiation declaration and we try to deploy to a dylib -// that does not provide those symbols, we'll get a load-time error. 
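The block removed above (together with the stream/dylib note that follows) is the vendor availability machinery; in this version it is relocated out of <__config> — other hunks in this patch add #include <__availability> and a _LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS knob. A minimal sketch of what such an annotation does, using a hypothetical symbol (my_feature) rather than anything from libc++:

// Clang-only sketch: a declaration tagged with a strict availability
// attribute cannot be used when the deployment target predates the symbol.
#if defined(__APPLE__) && defined(__clang__)
#  define DEMO_AVAILABLE_MACOS_10_13 \
     __attribute__((availability(macosx, strict, introduced = 10.13)))
#else
#  define DEMO_AVAILABLE_MACOS_10_13
#endif

DEMO_AVAILABLE_MACOS_10_13 inline void my_feature() {} // hypothetical symbol

int main() {
    my_feature(); // error when building with -mmacosx-version-min=10.12,
                  // accepted with a 10.13 or newer deployment target
    return 0;
}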
-#if !defined(_LIBCPP_BUILDING_LIBRARY) && \ - ((defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && \ - __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 1090) || \ - (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && \ - __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 70000)) -# define _LIBCPP_DO_NOT_ASSUME_STREAMS_EXPLICIT_INSTANTIATION_IN_DYLIB -#endif - #if defined(_LIBCPP_COMPILER_IBM) #define _LIBCPP_HAS_NO_PRAGMA_PUSH_POP_MACRO #endif @@ -1547,6 +1439,12 @@ _LIBCPP_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container( #define _LIBCPP_HAS_NO_FGETPOS_FSETPOS #endif +#if __has_attribute(init_priority) +# define _LIBCPP_INIT_PRIORITY_MAX __attribute__((init_priority(101))) +#else +# define _LIBCPP_INIT_PRIORITY_MAX +#endif + #endif // __cplusplus #endif // _LIBCPP_CONFIG diff --git a/lib/libcxx/include/__config_site.in b/lib/libcxx/include/__config_site.in index a6984b2eef..ec4d410bb9 100644 --- a/lib/libcxx/include/__config_site.in +++ b/lib/libcxx/include/__config_site.in @@ -26,12 +26,14 @@ #cmakedefine _LIBCPP_HAS_THREAD_API_WIN32 #cmakedefine _LIBCPP_HAS_THREAD_LIBRARY_EXTERNAL #cmakedefine _LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS +#cmakedefine _LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS #cmakedefine _LIBCPP_NO_VCRUNTIME -#ifndef _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION #cmakedefine _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION @_LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION@ -#endif #cmakedefine _LIBCPP_ABI_NAMESPACE @_LIBCPP_ABI_NAMESPACE@ +#cmakedefine _LIBCPP_HAS_NO_FILESYSTEM_LIBRARY #cmakedefine _LIBCPP_HAS_PARALLEL_ALGORITHMS +#cmakedefine _LIBCPP_HAS_NO_RANDOM_DEVICE +#cmakedefine _LIBCPP_HAS_NO_LOCALIZATION @_LIBCPP_ABI_DEFINES@ diff --git a/lib/libcxx/include/__debug b/lib/libcxx/include/__debug index 11367413fc..7b5bfb3f83 100644 --- a/lib/libcxx/include/__debug +++ b/lib/libcxx/include/__debug @@ -27,26 +27,21 @@ # include #endif -#if _LIBCPP_DEBUG_LEVEL >= 1 && !defined(_LIBCPP_ASSERT) -# define _LIBCPP_ASSERT(x, m) ((x) ? (void)0 : \ - _VSTD::__libcpp_debug_function(_VSTD::__libcpp_debug_info(__FILE__, __LINE__, #x, m))) -#endif - -#if _LIBCPP_DEBUG_LEVEL >= 2 -#ifndef _LIBCPP_DEBUG_ASSERT -#define _LIBCPP_DEBUG_ASSERT(x, m) _LIBCPP_ASSERT(x, m) -#endif -#define _LIBCPP_DEBUG_MODE(...) __VA_ARGS__ -#endif - -#ifndef _LIBCPP_ASSERT -# define _LIBCPP_ASSERT(x, m) ((void)0) -#endif -#ifndef _LIBCPP_DEBUG_ASSERT +#if _LIBCPP_DEBUG_LEVEL == 0 # define _LIBCPP_DEBUG_ASSERT(x, m) ((void)0) +# define _LIBCPP_ASSERT_IMPL(x, m) ((void)0) +#elif _LIBCPP_DEBUG_LEVEL == 1 +# define _LIBCPP_DEBUG_ASSERT(x, m) ((void)0) +# define _LIBCPP_ASSERT_IMPL(x, m) ((x) ? (void)0 : _VSTD::__libcpp_debug_function(_VSTD::__libcpp_debug_info(__FILE__, __LINE__, #x, m))) +#elif _LIBCPP_DEBUG_LEVEL == 2 +# define _LIBCPP_DEBUG_ASSERT(x, m) _LIBCPP_ASSERT(x, m) +# define _LIBCPP_ASSERT_IMPL(x, m) ((x) ? (void)0 : _VSTD::__libcpp_debug_function(_VSTD::__libcpp_debug_info(__FILE__, __LINE__, #x, m))) +#else +# error _LIBCPP_DEBUG_LEVEL must be one of 0, 1, 2 #endif -#ifndef _LIBCPP_DEBUG_MODE -#define _LIBCPP_DEBUG_MODE(...) 
((void)0) + +#if !defined(_LIBCPP_ASSERT) +# define _LIBCPP_ASSERT(x, m) _LIBCPP_ASSERT_IMPL(x, m) #endif _LIBCPP_BEGIN_NAMESPACE_STD @@ -59,7 +54,7 @@ struct _LIBCPP_TEMPLATE_VIS __libcpp_debug_info { __libcpp_debug_info(const char* __f, int __l, const char* __p, const char* __m) : __file_(__f), __line_(__l), __pred_(__p), __msg_(__m) {} - _LIBCPP_FUNC_VIS std::string what() const; + _LIBCPP_FUNC_VIS string what() const; const char* __file_; int __line_; @@ -83,7 +78,7 @@ void __libcpp_abort_debug_function(__libcpp_debug_info const&); _LIBCPP_FUNC_VIS bool __libcpp_set_debug_function(__libcpp_debug_function_type __func); -#if _LIBCPP_DEBUG_LEVEL >= 2 || defined(_LIBCPP_BUILDING_LIBRARY) +#if _LIBCPP_DEBUG_LEVEL == 2 || defined(_LIBCPP_BUILDING_LIBRARY) struct _LIBCPP_TYPE_VIS __c_node; @@ -226,7 +221,7 @@ public: template _LIBCPP_INLINE_VISIBILITY static __c_node* __create_C_node(void *__mem, void *__c, __c_node *__next) { - return ::new(__mem) _C_node<_Cont>(__c, __next); + return ::new (__mem) _C_node<_Cont>(__c, __next); } template @@ -271,7 +266,7 @@ _LIBCPP_FUNC_VIS __libcpp_db* __get_db(); _LIBCPP_FUNC_VIS const __libcpp_db* __get_const_db(); -#endif // _LIBCPP_DEBUG_LEVEL >= 2 || defined(_LIBCPP_BUILDING_LIBRARY) +#endif // _LIBCPP_DEBUG_LEVEL == 2 || defined(_LIBCPP_BUILDING_LIBRARY) _LIBCPP_END_NAMESPACE_STD diff --git a/lib/libcxx/include/__functional_03 b/lib/libcxx/include/__functional_03 index bf86428dea..9616480611 100644 --- a/lib/libcxx/include/__functional_03 +++ b/lib/libcxx/include/__functional_03 @@ -126,7 +126,7 @@ __func<_Fp, _Alloc, _Rp()>::__clone() const _Ap __a(__f_.second()); typedef __allocator_destructor<_Ap> _Dp; unique_ptr<__func, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); - ::new (__hold.get()) __func(__f_.first(), _Alloc(__a)); + ::new ((void*)__hold.get()) __func(__f_.first(), _Alloc(__a)); return __hold.release(); } @@ -134,7 +134,7 @@ template void __func<_Fp, _Alloc, _Rp()>::__clone(__base<_Rp()>* __p) const { - ::new (__p) __func(__f_.first(), __f_.second()); + ::new ((void*)__p) __func(__f_.first(), __f_.second()); } template @@ -212,7 +212,7 @@ __func<_Fp, _Alloc, _Rp(_A0)>::__clone() const _Ap __a(__f_.second()); typedef __allocator_destructor<_Ap> _Dp; unique_ptr<__func, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); - ::new (__hold.get()) __func(__f_.first(), _Alloc(__a)); + ::new ((void*)__hold.get()) __func(__f_.first(), _Alloc(__a)); return __hold.release(); } @@ -220,7 +220,7 @@ template void __func<_Fp, _Alloc, _Rp(_A0)>::__clone(__base<_Rp(_A0)>* __p) const { - ::new (__p) __func(__f_.first(), __f_.second()); + ::new ((void*)__p) __func(__f_.first(), __f_.second()); } template @@ -298,7 +298,7 @@ __func<_Fp, _Alloc, _Rp(_A0, _A1)>::__clone() const _Ap __a(__f_.second()); typedef __allocator_destructor<_Ap> _Dp; unique_ptr<__func, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); - ::new (__hold.get()) __func(__f_.first(), _Alloc(__a)); + ::new ((void*)__hold.get()) __func(__f_.first(), _Alloc(__a)); return __hold.release(); } @@ -306,7 +306,7 @@ template void __func<_Fp, _Alloc, _Rp(_A0, _A1)>::__clone(__base<_Rp(_A0, _A1)>* __p) const { - ::new (__p) __func(__f_.first(), __f_.second()); + ::new ((void*)__p) __func(__f_.first(), __f_.second()); } template @@ -384,7 +384,7 @@ __func<_Fp, _Alloc, _Rp(_A0, _A1, _A2)>::__clone() const _Ap __a(__f_.second()); typedef __allocator_destructor<_Ap> _Dp; unique_ptr<__func, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); - ::new (__hold.get()) __func(__f_.first(), _Alloc(__a)); + ::new ((void*)__hold.get()) 
__func(__f_.first(), _Alloc(__a)); return __hold.release(); } @@ -392,7 +392,7 @@ template void __func<_Fp, _Alloc, _Rp(_A0, _A1, _A2)>::__clone(__base<_Rp(_A0, _A1, _A2)>* __p) const { - ::new (__p) __func(__f_.first(), __f_.second()); + ::new ((void*)__p) __func(__f_.first(), __f_.second()); } template @@ -554,7 +554,7 @@ function<_Rp()>::function(_Fp __f, if (sizeof(_FF) <= sizeof(__buf_)) { __f_ = (__base*)&__buf_; - ::new (__f_) _FF(__f); + ::new ((void*)__f_) _FF(__f); } else { @@ -562,7 +562,7 @@ function<_Rp()>::function(_Fp __f, _Ap __a; typedef __allocator_destructor<_Ap> _Dp; unique_ptr<__base, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); - ::new (__hold.get()) _FF(__f, allocator<_Fp>(__a)); + ::new ((void*)__hold.get()) _FF(__f, allocator<_Fp>(__a)); __f_ = __hold.release(); } } @@ -581,7 +581,7 @@ function<_Rp()>::function(allocator_arg_t, const _Alloc& __a0, _Fp __f, if (sizeof(_FF) <= sizeof(__buf_)) { __f_ = (__base*)&__buf_; - ::new (__f_) _FF(__f, __a0); + ::new ((void*)__f_) _FF(__f, __a0); } else { @@ -589,7 +589,7 @@ function<_Rp()>::function(allocator_arg_t, const _Alloc& __a0, _Fp __f, _Ap __a(__a0); typedef __allocator_destructor<_Ap> _Dp; unique_ptr<__base, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); - ::new (__hold.get()) _FF(__f, _Alloc(__a)); + ::new ((void*)__hold.get()) _FF(__f, _Alloc(__a)); __f_ = __hold.release(); } } @@ -834,7 +834,7 @@ function<_Rp(_A0)>::function(_Fp __f, if (sizeof(_FF) <= sizeof(__buf_)) { __f_ = (__base*)&__buf_; - ::new (__f_) _FF(__f); + ::new ((void*)__f_) _FF(__f); } else { @@ -842,7 +842,7 @@ function<_Rp(_A0)>::function(_Fp __f, _Ap __a; typedef __allocator_destructor<_Ap> _Dp; unique_ptr<__base, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); - ::new (__hold.get()) _FF(__f, allocator<_Fp>(__a)); + ::new ((void*)__hold.get()) _FF(__f, allocator<_Fp>(__a)); __f_ = __hold.release(); } } @@ -861,7 +861,7 @@ function<_Rp(_A0)>::function(allocator_arg_t, const _Alloc& __a0, _Fp __f, if (sizeof(_FF) <= sizeof(__buf_)) { __f_ = (__base*)&__buf_; - ::new (__f_) _FF(__f, __a0); + ::new ((void*)__f_) _FF(__f, __a0); } else { @@ -869,7 +869,7 @@ function<_Rp(_A0)>::function(allocator_arg_t, const _Alloc& __a0, _Fp __f, _Ap __a(__a0); typedef __allocator_destructor<_Ap> _Dp; unique_ptr<__base, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); - ::new (__hold.get()) _FF(__f, _Alloc(__a)); + ::new ((void*)__hold.get()) _FF(__f, _Alloc(__a)); __f_ = __hold.release(); } } @@ -1114,7 +1114,7 @@ function<_Rp(_A0, _A1)>::function(_Fp __f, if (sizeof(_FF) <= sizeof(__buf_)) { __f_ = (__base*)&__buf_; - ::new (__f_) _FF(__f); + ::new ((void*)__f_) _FF(__f); } else { @@ -1122,7 +1122,7 @@ function<_Rp(_A0, _A1)>::function(_Fp __f, _Ap __a; typedef __allocator_destructor<_Ap> _Dp; unique_ptr<__base, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); - ::new (__hold.get()) _FF(__f, allocator<_Fp>(__a)); + ::new ((void*)__hold.get()) _FF(__f, allocator<_Fp>(__a)); __f_ = __hold.release(); } } @@ -1141,7 +1141,7 @@ function<_Rp(_A0, _A1)>::function(allocator_arg_t, const _Alloc& __a0, _Fp __f, if (sizeof(_FF) <= sizeof(__buf_)) { __f_ = (__base*)&__buf_; - ::new (__f_) _FF(__f, __a0); + ::new ((void*)__f_) _FF(__f, __a0); } else { @@ -1149,7 +1149,7 @@ function<_Rp(_A0, _A1)>::function(allocator_arg_t, const _Alloc& __a0, _Fp __f, _Ap __a(__a0); typedef __allocator_destructor<_Ap> _Dp; unique_ptr<__base, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); - ::new (__hold.get()) _FF(__f, _Alloc(__a)); + ::new ((void*)__hold.get()) _FF(__f, _Alloc(__a)); __f_ = __hold.release(); } } @@ 
-1394,7 +1394,7 @@ function<_Rp(_A0, _A1, _A2)>::function(_Fp __f, if (sizeof(_FF) <= sizeof(__buf_)) { __f_ = (__base*)&__buf_; - ::new (__f_) _FF(__f); + ::new ((void*)__f_) _FF(__f); } else { @@ -1402,7 +1402,7 @@ function<_Rp(_A0, _A1, _A2)>::function(_Fp __f, _Ap __a; typedef __allocator_destructor<_Ap> _Dp; unique_ptr<__base, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); - ::new (__hold.get()) _FF(__f, allocator<_Fp>(__a)); + ::new ((void*)__hold.get()) _FF(__f, allocator<_Fp>(__a)); __f_ = __hold.release(); } } @@ -1421,7 +1421,7 @@ function<_Rp(_A0, _A1, _A2)>::function(allocator_arg_t, const _Alloc& __a0, _Fp if (sizeof(_FF) <= sizeof(__buf_)) { __f_ = (__base*)&__buf_; - ::new (__f_) _FF(__f, __a0); + ::new ((void*)__f_) _FF(__f, __a0); } else { @@ -1429,7 +1429,7 @@ function<_Rp(_A0, _A1, _A2)>::function(allocator_arg_t, const _Alloc& __a0, _Fp _Ap __a(__a0); typedef __allocator_destructor<_Ap> _Dp; unique_ptr<__base, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); - ::new (__hold.get()) _FF(__f, _Alloc(__a)); + ::new ((void*)__hold.get()) _FF(__f, _Alloc(__a)); __f_ = __hold.release(); } } diff --git a/lib/libcxx/include/__functional_base b/lib/libcxx/include/__functional_base index f591bf5a9d..1c02e960d5 100644 --- a/lib/libcxx/include/__functional_base +++ b/lib/libcxx/include/__functional_base @@ -298,7 +298,7 @@ struct __weak_result_type<_Rp (_Cp::*)(_A1, _A2, _A3...) const volatile> template struct __invoke_return { - typedef decltype(__invoke(_VSTD::declval<_Tp>(), _VSTD::declval<_Args>()...)) type; + typedef decltype(_VSTD::__invoke(declval<_Tp>(), declval<_Args>()...)) type; }; #else // defined(_LIBCPP_CXX03_LANG) @@ -308,64 +308,64 @@ struct __invoke_return #endif // !defined(_LIBCPP_CXX03_LANG) -template +template ::value> struct __invoke_void_return_wrapper { #ifndef _LIBCPP_CXX03_LANG template static _Ret __call(_Args&&... __args) { - return __invoke(_VSTD::forward<_Args>(__args)...); + return _VSTD::__invoke(_VSTD::forward<_Args>(__args)...); } #else template static _Ret __call(_Fn __f) { - return __invoke(__f); + return _VSTD::__invoke(__f); } template static _Ret __call(_Fn __f, _A0& __a0) { - return __invoke(__f, __a0); + return _VSTD::__invoke(__f, __a0); } template static _Ret __call(_Fn __f, _A0& __a0, _A1& __a1) { - return __invoke(__f, __a0, __a1); + return _VSTD::__invoke(__f, __a0, __a1); } template static _Ret __call(_Fn __f, _A0& __a0, _A1& __a1, _A2& __a2){ - return __invoke(__f, __a0, __a1, __a2); + return _VSTD::__invoke(__f, __a0, __a1, __a2); } #endif }; -template <> -struct __invoke_void_return_wrapper +template +struct __invoke_void_return_wrapper<_Ret, true> { #ifndef _LIBCPP_CXX03_LANG template static void __call(_Args&&... 
__args) { - __invoke(_VSTD::forward<_Args>(__args)...); + _VSTD::__invoke(_VSTD::forward<_Args>(__args)...); } #else template static void __call(_Fn __f) { - __invoke(__f); + _VSTD::__invoke(__f); } template static void __call(_Fn __f, _A0& __a0) { - __invoke(__f, __a0); + _VSTD::__invoke(__f, __a0); } template static void __call(_Fn __f, _A0& __a0, _A1& __a1) { - __invoke(__f, __a0, __a1); + _VSTD::__invoke(__f, __a0, __a1); } template static void __call(_Fn __f, _A0& __a0, _A1& __a1, _A2& __a2) { - __invoke(__f, __a0, __a1, __a2); + _VSTD::__invoke(__f, __a0, __a1, __a2); } #endif }; @@ -382,135 +382,138 @@ private: public: // construct/copy/destroy - _LIBCPP_INLINE_VISIBILITY reference_wrapper(type& __f) _NOEXCEPT + _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17 + reference_wrapper(type& __f) _NOEXCEPT : __f_(_VSTD::addressof(__f)) {} #ifndef _LIBCPP_CXX03_LANG private: reference_wrapper(type&&); public: // = delete; // do not bind to temps #endif // access - _LIBCPP_INLINE_VISIBILITY operator type& () const _NOEXCEPT {return *__f_;} - _LIBCPP_INLINE_VISIBILITY type& get() const _NOEXCEPT {return *__f_;} + _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17 + operator type&() const _NOEXCEPT {return *__f_;} + _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17 + type& get() const _NOEXCEPT {return *__f_;} #ifndef _LIBCPP_CXX03_LANG // invoke template - _LIBCPP_INLINE_VISIBILITY + _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17 typename __invoke_of::type operator() (_ArgTypes&&... __args) const { - return __invoke(get(), _VSTD::forward<_ArgTypes>(__args)...); + return _VSTD::__invoke(get(), _VSTD::forward<_ArgTypes>(__args)...); } #else _LIBCPP_INLINE_VISIBILITY typename __invoke_return::type operator() () const { - return __invoke(get()); + return _VSTD::__invoke(get()); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return0::type operator() (_A0& __a0) const { - return __invoke(get(), __a0); + return _VSTD::__invoke(get(), __a0); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return0::type operator() (_A0 const& __a0) const { - return __invoke(get(), __a0); + return _VSTD::__invoke(get(), __a0); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return1::type operator() (_A0& __a0, _A1& __a1) const { - return __invoke(get(), __a0, __a1); + return _VSTD::__invoke(get(), __a0, __a1); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return1::type operator() (_A0 const& __a0, _A1& __a1) const { - return __invoke(get(), __a0, __a1); + return _VSTD::__invoke(get(), __a0, __a1); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return1::type operator() (_A0& __a0, _A1 const& __a1) const { - return __invoke(get(), __a0, __a1); + return _VSTD::__invoke(get(), __a0, __a1); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return1::type operator() (_A0 const& __a0, _A1 const& __a1) const { - return __invoke(get(), __a0, __a1); + return _VSTD::__invoke(get(), __a0, __a1); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return2::type operator() (_A0& __a0, _A1& __a1, _A2& __a2) const { - return __invoke(get(), __a0, __a1, __a2); + return _VSTD::__invoke(get(), __a0, __a1, __a2); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return2::type operator() (_A0 const& __a0, _A1& __a1, _A2& __a2) const { - return __invoke(get(), __a0, __a1, __a2); + return _VSTD::__invoke(get(), __a0, __a1, __a2); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return2::type operator() (_A0& __a0, _A1 const& __a1, _A2& __a2) const { - return 
__invoke(get(), __a0, __a1, __a2); + return _VSTD::__invoke(get(), __a0, __a1, __a2); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return2::type operator() (_A0& __a0, _A1& __a1, _A2 const& __a2) const { - return __invoke(get(), __a0, __a1, __a2); + return _VSTD::__invoke(get(), __a0, __a1, __a2); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return2::type operator() (_A0 const& __a0, _A1 const& __a1, _A2& __a2) const { - return __invoke(get(), __a0, __a1, __a2); + return _VSTD::__invoke(get(), __a0, __a1, __a2); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return2::type operator() (_A0 const& __a0, _A1& __a1, _A2 const& __a2) const { - return __invoke(get(), __a0, __a1, __a2); + return _VSTD::__invoke(get(), __a0, __a1, __a2); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return2::type operator() (_A0& __a0, _A1 const& __a1, _A2 const& __a2) const { - return __invoke(get(), __a0, __a1, __a2); + return _VSTD::__invoke(get(), __a0, __a1, __a2); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return2::type operator() (_A0 const& __a0, _A1 const& __a1, _A2 const& __a2) const { - return __invoke(get(), __a0, __a1, __a2); + return _VSTD::__invoke(get(), __a0, __a1, __a2); } #endif // _LIBCPP_CXX03_LANG }; template -inline _LIBCPP_INLINE_VISIBILITY +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17 reference_wrapper<_Tp> ref(_Tp& __t) _NOEXCEPT { @@ -518,7 +521,7 @@ ref(_Tp& __t) _NOEXCEPT } template -inline _LIBCPP_INLINE_VISIBILITY +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17 reference_wrapper<_Tp> ref(reference_wrapper<_Tp> __t) _NOEXCEPT { @@ -526,7 +529,7 @@ ref(reference_wrapper<_Tp> __t) _NOEXCEPT } template -inline _LIBCPP_INLINE_VISIBILITY +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17 reference_wrapper cref(const _Tp& __t) _NOEXCEPT { @@ -534,7 +537,7 @@ cref(const _Tp& __t) _NOEXCEPT } template -inline _LIBCPP_INLINE_VISIBILITY +inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17 reference_wrapper cref(reference_wrapper<_Tp> __t) _NOEXCEPT { diff --git a/lib/libcxx/include/__functional_base_03 b/lib/libcxx/include/__functional_base_03 index e6dac90c84..9b08bd26a8 100644 --- a/lib/libcxx/include/__functional_base_03 +++ b/lib/libcxx/include/__functional_base_03 @@ -40,7 +40,7 @@ struct __enable_invoke_imp<_Ret, _T1, false, true> { template struct __enable_invoke_imp<_Ret, _T1, false, false> { typedef typename add_lvalue_reference< - typename __apply_cv()), _Ret>::type + typename __apply_cv()), _Ret>::type >::type _Bullet4; typedef _Bullet4 type; }; @@ -142,7 +142,7 @@ __invoke(_Fn __f, _T1& __t1) { template inline _LIBCPP_INLINE_VISIBILITY -decltype(_VSTD::declval<_Fp&>()()) +decltype(declval<_Fp&>()()) __invoke(_Fp& __f) { return __f(); @@ -150,7 +150,7 @@ __invoke(_Fp& __f) template inline _LIBCPP_INLINE_VISIBILITY -decltype(_VSTD::declval<_Fp&>()(_VSTD::declval<_A0&>())) +decltype(declval<_Fp&>()(declval<_A0&>())) __invoke(_Fp& __f, _A0& __a0) { return __f(__a0); @@ -158,7 +158,7 @@ __invoke(_Fp& __f, _A0& __a0) template inline _LIBCPP_INLINE_VISIBILITY -decltype(_VSTD::declval<_Fp&>()(_VSTD::declval<_A0&>(), _VSTD::declval<_A1&>())) +decltype(declval<_Fp&>()(declval<_A0&>(), declval<_A1&>())) __invoke(_Fp& __f, _A0& __a0, _A1& __a1) { return __f(__a0, __a1); @@ -166,7 +166,7 @@ __invoke(_Fp& __f, _A0& __a0, _A1& __a1) template inline _LIBCPP_INLINE_VISIBILITY -decltype(_VSTD::declval<_Fp&>()(_VSTD::declval<_A0&>(), _VSTD::declval<_A1&>(), _VSTD::declval<_A2&>())) 
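A recurring change in these hunks is spelling internal calls as _VSTD::__invoke(...) (with _VSTD being libc++'s spelling of the std namespace) instead of calling them unqualified; the qualification keeps argument-dependent lookup from pulling in overloads that live in the namespaces of user-supplied argument types. A simplified illustration of that hazard with hypothetical names (lib::do_log, user::Widget), not libc++ internals:

#include <iostream>

namespace lib {
    template <class T>
    void do_log(const T&) { std::cout << "lib::do_log\n"; }

    template <class T>
    void process_unqualified(const T& value) {
        do_log(value);      // unqualified: ADL also searches T's namespace
    }

    template <class T>
    void process_qualified(const T& value) {
        lib::do_log(value); // qualified: always resolves to lib::do_log
    }
}

namespace user {
    struct Widget {};
    void do_log(Widget) { std::cout << "user::do_log\n"; } // found via ADL
}

int main() {
    user::Widget w;
    lib::process_unqualified(w); // prints "user::do_log"
    lib::process_qualified(w);   // prints "lib::do_log"
}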
+decltype(declval<_Fp&>()(declval<_A0&>(), declval<_A1&>(), declval<_A2&>())) __invoke(_Fp& __f, _A0& __a0, _A1& __a1, _A2& __a2) { return __f(__a0, __a1, __a2); @@ -181,13 +181,13 @@ struct __invoke_return template struct __invoke_return<_Fp, false> { - typedef decltype(__invoke(_VSTD::declval<_Fp&>())) type; + typedef decltype(_VSTD::__invoke(declval<_Fp&>())) type; }; template struct __invoke_return0 { - typedef decltype(__invoke(_VSTD::declval<_Tp&>(), _VSTD::declval<_A0&>())) type; + typedef decltype(_VSTD::__invoke(declval<_Tp&>(), declval<_A0&>())) type; }; template @@ -199,8 +199,8 @@ struct __invoke_return0<_Rp _Tp::*, _A0> template struct __invoke_return1 { - typedef decltype(__invoke(_VSTD::declval<_Tp&>(), _VSTD::declval<_A0&>(), - _VSTD::declval<_A1&>())) type; + typedef decltype(_VSTD::__invoke(declval<_Tp&>(), declval<_A0&>(), + declval<_A1&>())) type; }; template @@ -211,9 +211,9 @@ struct __invoke_return1<_Rp _Class::*, _A0, _A1> { template struct __invoke_return2 { - typedef decltype(__invoke(_VSTD::declval<_Tp&>(), _VSTD::declval<_A0&>(), - _VSTD::declval<_A1&>(), - _VSTD::declval<_A2&>())) type; + typedef decltype(_VSTD::__invoke(declval<_Tp&>(), declval<_A0&>(), + declval<_A1&>(), + declval<_A2&>())) type; }; template diff --git a/lib/libcxx/include/__hash_table b/lib/libcxx/include/__hash_table index 13ff096897..521ebbf2c4 100644 --- a/lib/libcxx/include/__hash_table +++ b/lib/libcxx/include/__hash_table @@ -34,19 +34,17 @@ _LIBCPP_BEGIN_NAMESPACE_STD template struct __hash_value_type; -#ifndef _LIBCPP_CXX03_LANG template struct __is_hash_value_type_imp : false_type {}; template -struct __is_hash_value_type_imp<__hash_value_type<_Key, _Value>> : true_type {}; +struct __is_hash_value_type_imp<__hash_value_type<_Key, _Value> > : true_type {}; template struct __is_hash_value_type : false_type {}; template struct __is_hash_value_type<_One> : __is_hash_value_type_imp::type> {}; -#endif _LIBCPP_FUNC_VIS size_t __next_prime(size_t __n); @@ -122,7 +120,7 @@ inline _LIBCPP_INLINE_VISIBILITY size_t __next_hash_pow2(size_t __n) { - return __n < 2 ? __n : (size_t(1) << (std::numeric_limits::digits - __libcpp_clz(__n-1))); + return __n < 2 ? 
__n : (size_t(1) << (numeric_limits::digits - __libcpp_clz(__n-1))); } @@ -155,12 +153,10 @@ struct __hash_key_value_types { static __container_value_type* __get_ptr(__node_value_type& __n) { return _VSTD::addressof(__n); } -#ifndef _LIBCPP_CXX03_LANG _LIBCPP_INLINE_VISIBILITY static __container_value_type&& __move(__node_value_type& __v) { return _VSTD::move(__v); } -#endif }; template @@ -197,13 +193,10 @@ struct __hash_key_value_types<__hash_value_type<_Key, _Tp> > { static __container_value_type* __get_ptr(__node_value_type& __n) { return _VSTD::addressof(__n.__get_value()); } -#ifndef _LIBCPP_CXX03_LANG _LIBCPP_INLINE_VISIBILITY static pair __move(__node_value_type& __v) { return __v.__move(); } -#endif - }; template , @@ -295,10 +288,12 @@ public: typedef typename _NodeTypes::__node_value_type_pointer pointer; _LIBCPP_INLINE_VISIBILITY __hash_iterator() _NOEXCEPT : __node_(nullptr) { - _LIBCPP_DEBUG_MODE(__get_db()->__insert_i(this)); +#if _LIBCPP_DEBUG_LEVEL == 2 + __get_db()->__insert_i(this); +#endif } -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_INLINE_VISIBILITY __hash_iterator(const __hash_iterator& __i) : __node_(__i.__node_) @@ -322,7 +317,7 @@ public: } return *this; } -#endif // _LIBCPP_DEBUG_LEVEL >= 2 +#endif // _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_INLINE_VISIBILITY reference operator*() const { @@ -364,7 +359,7 @@ public: {return !(__x == __y);} private: -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_INLINE_VISIBILITY __hash_iterator(__next_pointer __node, const void* __c) _NOEXCEPT : __node_(__node) @@ -405,17 +400,21 @@ public: _LIBCPP_INLINE_VISIBILITY __hash_const_iterator() _NOEXCEPT : __node_(nullptr) { - _LIBCPP_DEBUG_MODE(__get_db()->__insert_i(this)); +#if _LIBCPP_DEBUG_LEVEL == 2 + __get_db()->__insert_i(this); +#endif } _LIBCPP_INLINE_VISIBILITY __hash_const_iterator(const __non_const_iterator& __x) _NOEXCEPT : __node_(__x.__node_) { - _LIBCPP_DEBUG_MODE(__get_db()->__iterator_copy(this, &__x)); +#if _LIBCPP_DEBUG_LEVEL == 2 + __get_db()->__iterator_copy(this, &__x); +#endif } -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_INLINE_VISIBILITY __hash_const_iterator(const __hash_const_iterator& __i) : __node_(__i.__node_) @@ -439,7 +438,7 @@ public: } return *this; } -#endif // _LIBCPP_DEBUG_LEVEL >= 2 +#endif // _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_INLINE_VISIBILITY reference operator*() const { @@ -480,7 +479,7 @@ public: {return !(__x == __y);} private: -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_INLINE_VISIBILITY __hash_const_iterator(__next_pointer __node, const void* __c) _NOEXCEPT : __node_(__node) @@ -518,10 +517,12 @@ public: typedef typename _NodeTypes::__node_value_type_pointer pointer; _LIBCPP_INLINE_VISIBILITY __hash_local_iterator() _NOEXCEPT : __node_(nullptr) { - _LIBCPP_DEBUG_MODE(__get_db()->__insert_i(this)); +#if _LIBCPP_DEBUG_LEVEL == 2 + __get_db()->__insert_i(this); +#endif } -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_INLINE_VISIBILITY __hash_local_iterator(const __hash_local_iterator& __i) : __node_(__i.__node_), @@ -549,7 +550,7 @@ public: } return *this; } -#endif // _LIBCPP_DEBUG_LEVEL >= 2 +#endif // _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_INLINE_VISIBILITY reference operator*() const { @@ -593,7 +594,7 @@ public: {return !(__x == __y);} private: -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_INLINE_VISIBILITY __hash_local_iterator(__next_pointer __node, size_t __bucket, size_t __bucket_count, const void* __c) 
_NOEXCEPT @@ -650,7 +651,9 @@ public: _LIBCPP_INLINE_VISIBILITY __hash_const_local_iterator() _NOEXCEPT : __node_(nullptr) { - _LIBCPP_DEBUG_MODE(__get_db()->__insert_i(this)); +#if _LIBCPP_DEBUG_LEVEL == 2 + __get_db()->__insert_i(this); +#endif } _LIBCPP_INLINE_VISIBILITY @@ -659,10 +662,12 @@ public: __bucket_(__x.__bucket_), __bucket_count_(__x.__bucket_count_) { - _LIBCPP_DEBUG_MODE(__get_db()->__iterator_copy(this, &__x)); +#if _LIBCPP_DEBUG_LEVEL == 2 + __get_db()->__iterator_copy(this, &__x); +#endif } -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_INLINE_VISIBILITY __hash_const_local_iterator(const __hash_const_local_iterator& __i) : __node_(__i.__node_), @@ -690,7 +695,7 @@ public: } return *this; } -#endif // _LIBCPP_DEBUG_LEVEL >= 2 +#endif // _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_INLINE_VISIBILITY reference operator*() const { @@ -734,7 +739,7 @@ public: {return !(__x == __y);} private: -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_INLINE_VISIBILITY __hash_const_local_iterator(__next_pointer __node, size_t __bucket, size_t __bucket_count, const void* __c) _NOEXCEPT @@ -783,7 +788,6 @@ public: _NOEXCEPT_(is_nothrow_copy_constructible::value) : __data_(__size, __a) {} -#ifndef _LIBCPP_CXX03_LANG _LIBCPP_INLINE_VISIBILITY __bucket_list_deallocator(__bucket_list_deallocator&& __x) _NOEXCEPT_(is_nothrow_move_constructible::value) @@ -791,7 +795,6 @@ public: { __x.size() = 0; } -#endif _LIBCPP_INLINE_VISIBILITY size_type& size() _NOEXCEPT {return __data_.first();} @@ -1007,7 +1010,6 @@ public: explicit __hash_table(const allocator_type& __a); __hash_table(const __hash_table& __u); __hash_table(const __hash_table& __u, const allocator_type& __a); -#ifndef _LIBCPP_CXX03_LANG __hash_table(__hash_table&& __u) _NOEXCEPT_( is_nothrow_move_constructible<__bucket_list>::value && @@ -1016,11 +1018,9 @@ public: is_nothrow_move_constructible::value && is_nothrow_move_constructible::value); __hash_table(__hash_table&& __u, const allocator_type& __a); -#endif // _LIBCPP_CXX03_LANG ~__hash_table(); __hash_table& operator=(const __hash_table& __u); -#ifndef _LIBCPP_CXX03_LANG _LIBCPP_INLINE_VISIBILITY __hash_table& operator=(__hash_table&& __u) _NOEXCEPT_( @@ -1028,7 +1028,6 @@ public: is_nothrow_move_assignable<__node_allocator>::value && is_nothrow_move_assignable::value && is_nothrow_move_assignable::value); -#endif template void __assign_unique(_InputIterator __first, _InputIterator __last); template @@ -1037,7 +1036,7 @@ public: _LIBCPP_INLINE_VISIBILITY size_type max_size() const _NOEXCEPT { - return std::min( + return _VSTD::min( __node_traits::max_size(__node_alloc()), numeric_limits::max() ); @@ -1066,7 +1065,6 @@ public: iterator __node_insert_multi(const_iterator __p, __node_pointer __nd); -#ifndef _LIBCPP_CXX03_LANG template _LIBCPP_INLINE_VISIBILITY pair __emplace_unique_key_args(_Key const& __k, _Args&&... 
__args); @@ -1151,15 +1149,6 @@ public: return __emplace_hint_multi(__p, _VSTD::forward<_Pp>(__x)); } -#else // !defined(_LIBCPP_CXX03_LANG) - template - _LIBCPP_INLINE_VISIBILITY - pair __emplace_unique_key_args(_Key const&, _Args& __args); - - iterator __insert_multi(const __container_value_type& __x); - iterator __insert_multi(const_iterator __p, const __container_value_type& __x); -#endif - _LIBCPP_INLINE_VISIBILITY pair __insert_unique(const __container_value_type& __x) { return __emplace_unique_key_args(_NodeTypes::__get_key(__x), __x); @@ -1295,7 +1284,7 @@ public: { _LIBCPP_ASSERT(__n < bucket_count(), "unordered container::begin(n) called with n >= bucket_count()"); -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return local_iterator(__bucket_list_[__n], __n, bucket_count(), this); #else return local_iterator(__bucket_list_[__n], __n, bucket_count()); @@ -1308,7 +1297,7 @@ public: { _LIBCPP_ASSERT(__n < bucket_count(), "unordered container::end(n) called with n >= bucket_count()"); -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return local_iterator(nullptr, __n, bucket_count(), this); #else return local_iterator(nullptr, __n, bucket_count()); @@ -1321,7 +1310,7 @@ public: { _LIBCPP_ASSERT(__n < bucket_count(), "unordered container::cbegin(n) called with n >= bucket_count()"); -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return const_local_iterator(__bucket_list_[__n], __n, bucket_count(), this); #else return const_local_iterator(__bucket_list_[__n], __n, bucket_count()); @@ -1334,35 +1323,30 @@ public: { _LIBCPP_ASSERT(__n < bucket_count(), "unordered container::cend(n) called with n >= bucket_count()"); -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return const_local_iterator(nullptr, __n, bucket_count(), this); #else return const_local_iterator(nullptr, __n, bucket_count()); #endif } -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 bool __dereferenceable(const const_iterator* __i) const; bool __decrementable(const const_iterator* __i) const; bool __addable(const const_iterator* __i, ptrdiff_t __n) const; bool __subscriptable(const const_iterator* __i, ptrdiff_t __n) const; -#endif // _LIBCPP_DEBUG_LEVEL >= 2 +#endif // _LIBCPP_DEBUG_LEVEL == 2 private: void __rehash(size_type __n); -#ifndef _LIBCPP_CXX03_LANG template __node_holder __construct_node(_Args&& ...__args); template __node_holder __construct_node_hash(size_t __hash, _First&& __f, _Rest&&... 
__rest); -#else // _LIBCPP_CXX03_LANG - __node_holder __construct_node(const __container_value_type& __v); - __node_holder __construct_node_hash(size_t __hash, const __container_value_type& __v); -#endif _LIBCPP_INLINE_VISIBILITY @@ -1373,7 +1357,6 @@ private: _LIBCPP_INLINE_VISIBILITY void __copy_assign_alloc(const __hash_table&, false_type) {} -#ifndef _LIBCPP_CXX03_LANG void __move_assign(__hash_table& __u, false_type); void __move_assign(__hash_table& __u, true_type) _NOEXCEPT_( @@ -1400,7 +1383,6 @@ private: } _LIBCPP_INLINE_VISIBILITY void __move_assign_alloc(__hash_table&, false_type) _NOEXCEPT {} -#endif // _LIBCPP_CXX03_LANG void __deallocate_node(__next_pointer __np) _NOEXCEPT; __next_pointer __detach() _NOEXCEPT; @@ -1477,8 +1459,6 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__hash_table(const __hash_table& __u, { } -#ifndef _LIBCPP_CXX03_LANG - template __hash_table<_Tp, _Hash, _Equal, _Alloc>::__hash_table(__hash_table&& __u) _NOEXCEPT_( @@ -1526,8 +1506,6 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__hash_table(__hash_table&& __u, } } -#endif // _LIBCPP_CXX03_LANG - template __hash_table<_Tp, _Hash, _Equal, _Alloc>::~__hash_table() { @@ -1539,7 +1517,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::~__hash_table() #endif __deallocate_node(__p1_.first().__next_); -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 __get_db()->__erase_c(this); #endif } @@ -1583,7 +1561,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__deallocate_node(__next_pointer __np) while (__np != nullptr) { __next_pointer __next = __np->__next_; -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 __c_node* __c = __get_db()->__find_c_and_lock(this); for (__i_node** __p = __c->end_; __p != __c->beg_; ) { @@ -1593,7 +1571,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__deallocate_node(__next_pointer __np) { (*__p)->__c_ = nullptr; if (--__c->end_ != __p) - memmove(__p, __p+1, (__c->end_ - __p)*sizeof(__i_node*)); + _VSTD::memmove(__p, __p+1, (__c->end_ - __p)*sizeof(__i_node*)); } } __get_db()->unlock(); @@ -1618,8 +1596,6 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__detach() _NOEXCEPT return __cache; } -#ifndef _LIBCPP_CXX03_LANG - template void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__move_assign( @@ -1646,7 +1622,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__move_assign( __u.__p1_.first().__next_ = nullptr; __u.size() = 0; } -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 __get_db()->swap(this, &__u); #endif } @@ -1714,8 +1690,6 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::operator=(__hash_table&& __u) return *this; } -#endif // _LIBCPP_CXX03_LANG - template template void @@ -1800,7 +1774,7 @@ inline typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::iterator __hash_table<_Tp, _Hash, _Equal, _Alloc>::begin() _NOEXCEPT { -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return iterator(__p1_.first().__next_, this); #else return iterator(__p1_.first().__next_); @@ -1812,7 +1786,7 @@ inline typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::iterator __hash_table<_Tp, _Hash, _Equal, _Alloc>::end() _NOEXCEPT { -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return iterator(nullptr, this); #else return iterator(nullptr); @@ -1824,7 +1798,7 @@ inline typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::const_iterator __hash_table<_Tp, _Hash, _Equal, _Alloc>::begin() const _NOEXCEPT { -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return const_iterator(__p1_.first().__next_, this); #else return const_iterator(__p1_.first().__next_); @@ -1836,7 
+1810,7 @@ inline typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::const_iterator __hash_table<_Tp, _Hash, _Equal, _Alloc>::end() const _NOEXCEPT { -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return const_iterator(nullptr, this); #else return const_iterator(nullptr); @@ -1945,7 +1919,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__node_insert_unique(__node_pointer __ __existing_node = __nd->__ptr(); __inserted = true; } -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return pair(iterator(__existing_node, this), __inserted); #else return pair(iterator(__existing_node), __inserted); @@ -1955,7 +1929,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__node_insert_unique(__node_pointer __ // Prepare the container for an insertion of the value __cp_val with the hash // __cp_hash. This does a lookup into the container to see if __cp_value is // already present, and performs a rehash if necessary. Returns a pointer to the -// last occurance of __cp_val in the map. +// last occurrence of __cp_val in the map. // // Note that this function does forward exceptions if key_eq() throws, and never // mutates __value or actually inserts into the map. @@ -2043,7 +2017,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__node_insert_multi(__node_pointer __c __next_pointer __pn = __node_insert_multi_prepare(__cp->__hash(), __cp->__value_); __node_insert_multi_perform(__cp, __pn); -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return iterator(__cp->__ptr(), this); #else return iterator(__cp->__ptr()); @@ -2055,7 +2029,7 @@ typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::iterator __hash_table<_Tp, _Hash, _Equal, _Alloc>::__node_insert_multi( const_iterator __p, __node_pointer __cp) { -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_ASSERT(__get_const_db()->__find_c_from_i(&__p) == this, "unordered container::emplace_hint(const_iterator, args...) called with an iterator not" " referring to this unordered container"); @@ -2078,7 +2052,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__node_insert_multi( __cp->__next_ = __np; __pp->__next_ = static_cast<__next_pointer>(__cp); ++size(); -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return iterator(static_cast<__next_pointer>(__cp), this); #else return iterator(static_cast<__next_pointer>(__cp)); @@ -2089,17 +2063,10 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__node_insert_multi( -#ifndef _LIBCPP_CXX03_LANG template template pair::iterator, bool> __hash_table<_Tp, _Hash, _Equal, _Alloc>::__emplace_unique_key_args(_Key const& __k, _Args&&... 
__args) -#else -template -template -pair::iterator, bool> -__hash_table<_Tp, _Hash, _Equal, _Alloc>::__emplace_unique_key_args(_Key const& __k, _Args& __args) -#endif { size_t __hash = hash_function()(__k); @@ -2123,11 +2090,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__emplace_unique_key_args(_Key const& } } { -#ifndef _LIBCPP_CXX03_LANG __node_holder __h = __construct_node_hash(__hash, _VSTD::forward<_Args>(__args)...); -#else - __node_holder __h = __construct_node_hash(__hash, __args); -#endif if (size()+1 > __bc * max_load_factor() || __bc == 0) { rehash(_VSTD::max(2 * __bc + !__is_hash_power2(__bc), @@ -2159,15 +2122,13 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__emplace_unique_key_args(_Key const& __inserted = true; } __done: -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return pair(iterator(__nd, this), __inserted); #else return pair(iterator(__nd), __inserted); #endif } -#ifndef _LIBCPP_CXX03_LANG - template template pair::iterator, bool> @@ -2197,7 +2158,7 @@ typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::iterator __hash_table<_Tp, _Hash, _Equal, _Alloc>::__emplace_hint_multi( const_iterator __p, _Args&&... __args) { -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_ASSERT(__get_const_db()->__find_c_from_i(&__p) == this, "unordered container::emplace_hint(const_iterator, args...) called with an iterator not" " referring to this unordered container"); @@ -2208,36 +2169,6 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__emplace_hint_multi( return __r; } -#else // _LIBCPP_CXX03_LANG - -template -typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::iterator -__hash_table<_Tp, _Hash, _Equal, _Alloc>::__insert_multi(const __container_value_type& __x) -{ - __node_holder __h = __construct_node(__x); - iterator __r = __node_insert_multi(__h.get()); - __h.release(); - return __r; -} - -template -typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::iterator -__hash_table<_Tp, _Hash, _Equal, _Alloc>::__insert_multi(const_iterator __p, - const __container_value_type& __x) -{ -#if _LIBCPP_DEBUG_LEVEL >= 2 - _LIBCPP_ASSERT(__get_const_db()->__find_c_from_i(&__p) == this, - "unordered container::insert(const_iterator, lvalue) called with an iterator not" - " referring to this unordered container"); -#endif - __node_holder __h = __construct_node(__x); - iterator __r = __node_insert_multi(__p, __h.get()); - __h.release(); - return __r; -} - -#endif // _LIBCPP_CXX03_LANG - #if _LIBCPP_STD_VER > 14 template template @@ -2399,9 +2330,9 @@ template void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__rehash(size_type __nbc) { -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 __get_db()->__invalidate_all(this); -#endif // _LIBCPP_DEBUG_LEVEL >= 2 +#endif __pointer_allocator& __npa = __bucket_list_.get_deleter().__alloc(); __bucket_list_.reset(__nbc > 0 ? 
__pointer_alloc_traits::allocate(__npa, __nbc) : nullptr); @@ -2470,7 +2401,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::find(const _Key& __k) { if ((__nd->__hash() == __hash) && key_eq()(__nd->__upcast()->__value_, __k)) -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return iterator(__nd, this); #else return iterator(__nd); @@ -2501,7 +2432,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::find(const _Key& __k) const { if ((__nd->__hash() == __hash) && key_eq()(__nd->__upcast()->__value_, __k)) -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return const_iterator(__nd, this); #else return const_iterator(__nd); @@ -2513,8 +2444,6 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::find(const _Key& __k) const return end(); } -#ifndef _LIBCPP_CXX03_LANG - template template typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::__node_holder @@ -2550,43 +2479,12 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__construct_node_hash( return __h; } -#else // _LIBCPP_CXX03_LANG - -template -typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::__node_holder -__hash_table<_Tp, _Hash, _Equal, _Alloc>::__construct_node(const __container_value_type& __v) -{ - __node_allocator& __na = __node_alloc(); - __node_holder __h(__node_traits::allocate(__na, 1), _Dp(__na)); - __node_traits::construct(__na, _NodeTypes::__get_ptr(__h->__value_), __v); - __h.get_deleter().__value_constructed = true; - __h->__hash_ = hash_function()(__h->__value_); - __h->__next_ = nullptr; - return _LIBCPP_EXPLICIT_MOVE(__h); // explicitly moved for C++03 -} - -template -typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::__node_holder -__hash_table<_Tp, _Hash, _Equal, _Alloc>::__construct_node_hash(size_t __hash, - const __container_value_type& __v) -{ - __node_allocator& __na = __node_alloc(); - __node_holder __h(__node_traits::allocate(__na, 1), _Dp(__na)); - __node_traits::construct(__na, _NodeTypes::__get_ptr(__h->__value_), __v); - __h.get_deleter().__value_constructed = true; - __h->__hash_ = __hash; - __h->__next_ = nullptr; - return _LIBCPP_EXPLICIT_MOVE(__h); // explicitly moved for C++03 -} - -#endif // _LIBCPP_CXX03_LANG - template typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::iterator __hash_table<_Tp, _Hash, _Equal, _Alloc>::erase(const_iterator __p) { __next_pointer __np = __p.__node_; -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_ASSERT(__get_const_db()->__find_c_from_i(&__p) == this, "unordered container erase(iterator) called with an iterator not" " referring to this container"); @@ -2606,7 +2504,7 @@ typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::iterator __hash_table<_Tp, _Hash, _Equal, _Alloc>::erase(const_iterator __first, const_iterator __last) { -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_ASSERT(__get_const_db()->__find_c_from_i(&__first) == this, "unodered container::erase(iterator, iterator) called with an iterator not" " referring to this unodered container"); @@ -2620,7 +2518,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::erase(const_iterator __first, erase(__p); } __next_pointer __np = __last.__node_; -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return iterator (__np, this); #else return iterator (__np); @@ -2691,7 +2589,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::remove(const_iterator __p) _NOEXCEPT __pn->__next_ = __cn->__next_; __cn->__next_ = nullptr; --size(); -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 __c_node* __c = __get_db()->__find_c_and_lock(this); for (__i_node** __dp = __c->end_; __dp != 
__c->beg_; ) { @@ -2701,7 +2599,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::remove(const_iterator __p) _NOEXCEPT { (*__dp)->__c_ = nullptr; if (--__c->end_ != __dp) - memmove(__dp, __dp+1, (__c->end_ - __dp)*sizeof(__i_node*)); + _VSTD::memmove(__dp, __dp+1, (__c->end_ - __dp)*sizeof(__i_node*)); } } __get_db()->unlock(); @@ -2830,9 +2728,9 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::swap(__hash_table& __u) __u.__bucket_list_.reset(__npp); } _VSTD::swap(__bucket_list_.get_deleter().size(), __u.__bucket_list_.get_deleter().size()); - __swap_allocator(__bucket_list_.get_deleter().__alloc(), + _VSTD::__swap_allocator(__bucket_list_.get_deleter().__alloc(), __u.__bucket_list_.get_deleter().__alloc()); - __swap_allocator(__node_alloc(), __u.__node_alloc()); + _VSTD::__swap_allocator(__node_alloc(), __u.__node_alloc()); _VSTD::swap(__p1_.first().__next_, __u.__p1_.first().__next_); __p2_.swap(__u.__p2_); __p3_.swap(__u.__p3_); @@ -2842,7 +2740,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::swap(__hash_table& __u) if (__u.size() > 0) __u.__bucket_list_[__constrain_hash(__u.__p1_.first().__next_->__hash(), __u.bucket_count())] = __u.__p1_.first().__ptr(); -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 __get_db()->swap(this, &__u); #endif } @@ -2876,7 +2774,7 @@ swap(__hash_table<_Tp, _Hash, _Equal, _Alloc>& __x, __x.swap(__y); } -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 template bool @@ -2906,7 +2804,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__subscriptable(const const_iterator*, return false; } -#endif // _LIBCPP_DEBUG_LEVEL >= 2 +#endif // _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_END_NAMESPACE_STD diff --git a/lib/libcxx/include/__libcpp_version b/lib/libcxx/include/__libcpp_version index 82b3803a20..e334181b40 100644 --- a/lib/libcxx/include/__libcpp_version +++ b/lib/libcxx/include/__libcpp_version @@ -1 +1 @@ -11000 +12000 diff --git a/lib/libcxx/include/__locale b/lib/libcxx/include/__locale index 6d10fa4d3d..77e5faab26 100644 --- a/lib/libcxx/include/__locale +++ b/lib/libcxx/include/__locale @@ -11,6 +11,7 @@ #define _LIBCPP___LOCALE #include <__config> +#include <__availability> #include #include #include @@ -20,26 +21,30 @@ #include #if defined(_LIBCPP_MSVCRT_LIKE) # include -# include -#elif defined(_AIX) -# include +# include <__support/win32/locale_win32.h> +#elif defined(__NuttX__) +# include <__support/nuttx/xlocale.h> +#elif defined(_AIX) || defined(__MVS__) +# include <__support/ibm/xlocale.h> #elif defined(__ANDROID__) -# include +# include <__support/android/locale_bionic.h> #elif defined(__sun__) # include -# include +# include <__support/solaris/xlocale.h> #elif defined(_NEWLIB_VERSION) -# include +# include <__support/newlib/xlocale.h> +#elif defined(__OpenBSD__) +# include <__support/openbsd/xlocale.h> #elif (defined(__APPLE__) || defined(__FreeBSD__) \ || defined(__EMSCRIPTEN__) || defined(__IBMCPP__)) # include #elif defined(__Fuchsia__) -# include +# include <__support/fuchsia/xlocale.h> #elif defined(__wasi__) // WASI libc uses musl's locales support. -# include +# include <__support/musl/xlocale.h> #elif defined(_LIBCPP_HAS_MUSL_LIBC) -# include +# include <__support/musl/xlocale.h> #endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -76,7 +81,7 @@ struct __libcpp_locale_guard { // locale name, otherwise it will be a semicolon-separated string listing // each category. In the second case, we know at least one category won't // be what we want, so we only have to check the first case. 
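
Two mechanical themes run through the __hash_table hunks above and the __locale hunks that follow. First, the debug-iterator bookkeeping is now compiled only when _LIBCPP_DEBUG_LEVEL is exactly 2 (== 2 instead of >= 2), so a level-1 build keeps assertions without the iterator-tracking database. Second, internal calls such as memmove and __swap_allocator here, and strcmp just below, gain a _VSTD:: qualifier, which keeps calls whose arguments are user-supplied types (allocators in particular) out of reach of argument-dependent lookup and makes the spelling uniform for the plain C functions. A minimal sketch of the ADL hazard that qualification avoids; all names here are hypothetical, not libc++'s:

    namespace user { struct Buf {}; void destroy(Buf*); }

    namespace mylib {
      template <class T> void destroy(T*) {}

      template <class T>
      void reset(T* p) {
        // destroy(p);      // with T = user::Buf, ADL would select user::destroy,
        //                  // a non-template exact match, over the library helper
        mylib::destroy(p);  // qualified call, analogous to _VSTD:: in the hunks above
      }
    }
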
- if (strcmp(__l.__get_locale(), __lc) != 0) { + if (_VSTD::strcmp(__l.__get_locale(), __lc) != 0) { __locale_all = _strdup(__lc); if (__locale_all == nullptr) __throw_bad_alloc(); @@ -105,7 +110,6 @@ struct __libcpp_locale_guard { }; #endif - class _LIBCPP_TYPE_VIS locale; template @@ -335,8 +339,8 @@ collate<_CharT>::do_hash(const char_type* __lo, const char_type* __hi) const return static_cast(__h); } -_LIBCPP_EXTERN_TEMPLATE2(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS collate) -_LIBCPP_EXTERN_TEMPLATE2(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS collate) +_LIBCPP_EXTERN_TEMPLATE_EVEN_IN_DEBUG_MODE(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS collate) +_LIBCPP_EXTERN_TEMPLATE_EVEN_IN_DEBUG_MODE(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS collate) // template class collate_byname; @@ -396,7 +400,26 @@ locale::operator()(const basic_string<_CharT, _Traits, _Allocator>& __x, class _LIBCPP_TYPE_VIS ctype_base { public: -#if defined(__GLIBC__) +#if defined(_LIBCPP_PROVIDES_DEFAULT_RUNE_TABLE) + typedef unsigned long mask; + static const mask space = 1<<0; + static const mask print = 1<<1; + static const mask cntrl = 1<<2; + static const mask upper = 1<<3; + static const mask lower = 1<<4; + static const mask alpha = 1<<5; + static const mask digit = 1<<6; + static const mask punct = 1<<7; + static const mask xdigit = 1<<8; + static const mask blank = 1<<9; +#if defined(__BIONIC__) + // Historically this was a part of regex_traits rather than ctype_base. The + // historical value of the constant is preserved for ABI compatibility. + static const mask __regex_word = 0x8000; +#else + static const mask __regex_word = 1<<10; +#endif // defined(__BIONIC__) +#elif defined(__GLIBC__) typedef unsigned short mask; static const mask space = _ISspace; static const mask print = _ISprint; @@ -485,24 +508,7 @@ public: # define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_ALPHA # define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_XDIGIT #else - typedef unsigned long mask; - static const mask space = 1<<0; - static const mask print = 1<<1; - static const mask cntrl = 1<<2; - static const mask upper = 1<<3; - static const mask lower = 1<<4; - static const mask alpha = 1<<5; - static const mask digit = 1<<6; - static const mask punct = 1<<7; - static const mask xdigit = 1<<8; - static const mask blank = 1<<9; -#if defined(__BIONIC__) - // Historically this was a part of regex_traits rather than ctype_base. The - // historical value of the constant is preserved for ABI compatibility. - static const mask __regex_word = 0x8000; -#else - static const mask __regex_word = 1<<10; -#endif // defined(__BIONIC__) +# error unknown rune table for this platform -- do you mean to define _LIBCPP_PROVIDES_DEFAULT_RUNE_TABLE? 
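
In the ctype_base hunk just above, the generic one-bit-per-category rune table becomes explicit: it is emitted only when _LIBCPP_PROVIDES_DEFAULT_RUNE_TABLE is defined, and the former silent fallback branch is replaced by a hard #error on platforms that define neither that macro nor one of the known C libraries. A condensed sketch of the mask layout (bit values copied from the hunk; the enclosing class and the platform-specific branches are omitted), together with the composite categories whose in-tree definitions continue right below:

    typedef unsigned long mask;
    const mask space = 1 << 0;
    const mask cntrl = 1 << 2;
    const mask upper = 1 << 3;
    const mask lower = 1 << 4;
    const mask alpha = 1 << 5;
    const mask digit = 1 << 6;
    const mask punct = 1 << 7;

    // Composite categories are plain bitwise ORs of the primitive bits.
    const mask alnum = alpha | digit;
    const mask graph = alnum | punct;

    inline bool is_alnum(mask m) { return (m & alnum) != 0; }
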
#endif static const mask alnum = alpha | digit; static const mask graph = alnum | punct; @@ -623,7 +629,7 @@ class _LIBCPP_TYPE_VIS ctype public: typedef char char_type; - explicit ctype(const mask* __tab = 0, bool __del = false, size_t __refs = 0); + explicit ctype(const mask* __tab = nullptr, bool __del = false, size_t __refs = 0); _LIBCPP_INLINE_VISIBILITY bool is(mask __m, char_type __c) const @@ -1069,10 +1075,10 @@ protected: virtual int do_max_length() const _NOEXCEPT; }; -// template <> class codecvt +// template <> class codecvt // deprecated in C++20 template <> -class _LIBCPP_TYPE_VIS codecvt +class _LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_TYPE_VIS codecvt : public locale::facet, public codecvt_base { @@ -1155,10 +1161,100 @@ protected: virtual int do_max_length() const _NOEXCEPT; }; -// template <> class codecvt +#ifndef _LIBCPP_NO_HAS_CHAR8_T + +// template <> class codecvt // C++20 template <> -class _LIBCPP_TYPE_VIS codecvt +class _LIBCPP_TYPE_VIS codecvt + : public locale::facet, + public codecvt_base +{ +public: + typedef char16_t intern_type; + typedef char8_t extern_type; + typedef mbstate_t state_type; + + _LIBCPP_INLINE_VISIBILITY + explicit codecvt(size_t __refs = 0) + : locale::facet(__refs) {} + + _LIBCPP_INLINE_VISIBILITY + result out(state_type& __st, + const intern_type* __frm, const intern_type* __frm_end, const intern_type*& __frm_nxt, + extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const + { + return do_out(__st, __frm, __frm_end, __frm_nxt, __to, __to_end, __to_nxt); + } + + _LIBCPP_INLINE_VISIBILITY + result unshift(state_type& __st, + extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const + { + return do_unshift(__st, __to, __to_end, __to_nxt); + } + + _LIBCPP_INLINE_VISIBILITY + result in(state_type& __st, + const extern_type* __frm, const extern_type* __frm_end, const extern_type*& __frm_nxt, + intern_type* __to, intern_type* __to_end, intern_type*& __to_nxt) const + { + return do_in(__st, __frm, __frm_end, __frm_nxt, __to, __to_end, __to_nxt); + } + + _LIBCPP_INLINE_VISIBILITY + int encoding() const _NOEXCEPT + { + return do_encoding(); + } + + _LIBCPP_INLINE_VISIBILITY + bool always_noconv() const _NOEXCEPT + { + return do_always_noconv(); + } + + _LIBCPP_INLINE_VISIBILITY + int length(state_type& __st, const extern_type* __frm, const extern_type* __end, size_t __mx) const + { + return do_length(__st, __frm, __end, __mx); + } + + _LIBCPP_INLINE_VISIBILITY + int max_length() const _NOEXCEPT + { + return do_max_length(); + } + + static locale::id id; + +protected: + _LIBCPP_INLINE_VISIBILITY + explicit codecvt(const char*, size_t __refs = 0) + : locale::facet(__refs) {} + + ~codecvt(); + + virtual result do_out(state_type& __st, + const intern_type* __frm, const intern_type* __frm_end, const intern_type*& __frm_nxt, + extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const; + virtual result do_in(state_type& __st, + const extern_type* __frm, const extern_type* __frm_end, const extern_type*& __frm_nxt, + intern_type* __to, intern_type* __to_end, intern_type*& __to_nxt) const; + virtual result do_unshift(state_type& __st, + extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const; + virtual int do_encoding() const _NOEXCEPT; + virtual bool do_always_noconv() const _NOEXCEPT; + virtual int do_length(state_type&, const extern_type* __frm, const extern_type* __end, size_t __mx) const; + virtual int do_max_length() const _NOEXCEPT; +}; + +#endif + +// template <> class codecvt // deprecated in C++20 
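
The block above introduces the C++20 facet codecvt<char16_t, char8_t, mbstate_t> under #ifndef _LIBCPP_NO_HAS_CHAR8_T and tags the older char-based specialization _LIBCPP_DEPRECATED_IN_CXX20; the matching char32_t facets follow below. A usage sketch, assuming a C++20 compiler with char8_t and a standard library that ships this facet:

    #include <cwchar>
    #include <locale>

    int main() {
      const char16_t src[] = u"zig";
      char8_t dst[16];
      const char16_t* from_next = nullptr;
      char8_t* to_next = nullptr;
      std::mbstate_t st{};

      const auto& cvt = std::use_facet<
          std::codecvt<char16_t, char8_t, std::mbstate_t>>(std::locale::classic());
      // Convert three UTF-16 code units into UTF-8 code units in dst.
      auto r = cvt.out(st, src, src + 3, from_next, dst, dst + 16, to_next);
      return r == std::codecvt_base::ok ? 0 : 1;
    }
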
+ +template <> +class _LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_TYPE_VIS codecvt : public locale::facet, public codecvt_base { @@ -1241,6 +1337,96 @@ protected: virtual int do_max_length() const _NOEXCEPT; }; +#ifndef _LIBCPP_NO_HAS_CHAR8_T + +// template <> class codecvt // C++20 + +template <> +class _LIBCPP_TYPE_VIS codecvt + : public locale::facet, + public codecvt_base +{ +public: + typedef char32_t intern_type; + typedef char8_t extern_type; + typedef mbstate_t state_type; + + _LIBCPP_INLINE_VISIBILITY + explicit codecvt(size_t __refs = 0) + : locale::facet(__refs) {} + + _LIBCPP_INLINE_VISIBILITY + result out(state_type& __st, + const intern_type* __frm, const intern_type* __frm_end, const intern_type*& __frm_nxt, + extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const + { + return do_out(__st, __frm, __frm_end, __frm_nxt, __to, __to_end, __to_nxt); + } + + _LIBCPP_INLINE_VISIBILITY + result unshift(state_type& __st, + extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const + { + return do_unshift(__st, __to, __to_end, __to_nxt); + } + + _LIBCPP_INLINE_VISIBILITY + result in(state_type& __st, + const extern_type* __frm, const extern_type* __frm_end, const extern_type*& __frm_nxt, + intern_type* __to, intern_type* __to_end, intern_type*& __to_nxt) const + { + return do_in(__st, __frm, __frm_end, __frm_nxt, __to, __to_end, __to_nxt); + } + + _LIBCPP_INLINE_VISIBILITY + int encoding() const _NOEXCEPT + { + return do_encoding(); + } + + _LIBCPP_INLINE_VISIBILITY + bool always_noconv() const _NOEXCEPT + { + return do_always_noconv(); + } + + _LIBCPP_INLINE_VISIBILITY + int length(state_type& __st, const extern_type* __frm, const extern_type* __end, size_t __mx) const + { + return do_length(__st, __frm, __end, __mx); + } + + _LIBCPP_INLINE_VISIBILITY + int max_length() const _NOEXCEPT + { + return do_max_length(); + } + + static locale::id id; + +protected: + _LIBCPP_INLINE_VISIBILITY + explicit codecvt(const char*, size_t __refs = 0) + : locale::facet(__refs) {} + + ~codecvt(); + + virtual result do_out(state_type& __st, + const intern_type* __frm, const intern_type* __frm_end, const intern_type*& __frm_nxt, + extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const; + virtual result do_in(state_type& __st, + const extern_type* __frm, const extern_type* __frm_end, const extern_type*& __frm_nxt, + intern_type* __to, intern_type* __to_end, intern_type*& __to_nxt) const; + virtual result do_unshift(state_type& __st, + extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const; + virtual int do_encoding() const _NOEXCEPT; + virtual bool do_always_noconv() const _NOEXCEPT; + virtual int do_length(state_type&, const extern_type* __frm, const extern_type* __end, size_t __mx) const; + virtual int do_max_length() const _NOEXCEPT; +}; + +#endif + // template class codecvt_byname template @@ -1258,15 +1444,21 @@ protected: ~codecvt_byname(); }; +_LIBCPP_SUPPRESS_DEPRECATED_PUSH template codecvt_byname<_InternT, _ExternT, _StateT>::~codecvt_byname() { } +_LIBCPP_SUPPRESS_DEPRECATED_POP -_LIBCPP_EXTERN_TEMPLATE2(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname) -_LIBCPP_EXTERN_TEMPLATE2(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname) -_LIBCPP_EXTERN_TEMPLATE2(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname) -_LIBCPP_EXTERN_TEMPLATE2(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname) +_LIBCPP_EXTERN_TEMPLATE_EVEN_IN_DEBUG_MODE(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname) 
+_LIBCPP_EXTERN_TEMPLATE_EVEN_IN_DEBUG_MODE(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname) +_LIBCPP_EXTERN_TEMPLATE_EVEN_IN_DEBUG_MODE(class _LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname) // deprecated in C++20 +_LIBCPP_EXTERN_TEMPLATE_EVEN_IN_DEBUG_MODE(class _LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname) // deprecated in C++20 +#ifndef _LIBCPP_NO_HAS_CHAR8_T +_LIBCPP_EXTERN_TEMPLATE_EVEN_IN_DEBUG_MODE(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname) // C++20 +_LIBCPP_EXTERN_TEMPLATE_EVEN_IN_DEBUG_MODE(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname) // C++20 +#endif template struct __narrow_to_utf8 @@ -1290,12 +1482,14 @@ struct __narrow_to_utf8<8> } }; +_LIBCPP_SUPPRESS_DEPRECATED_PUSH template <> struct _LIBCPP_TEMPLATE_VIS __narrow_to_utf8<16> : public codecvt { _LIBCPP_INLINE_VISIBILITY __narrow_to_utf8() : codecvt(1) {} +_LIBCPP_SUPPRESS_DEPRECATED_POP _LIBCPP_EXPORTED_FROM_ABI ~__narrow_to_utf8(); @@ -1324,12 +1518,14 @@ struct _LIBCPP_TEMPLATE_VIS __narrow_to_utf8<16> } }; +_LIBCPP_SUPPRESS_DEPRECATED_PUSH template <> struct _LIBCPP_TEMPLATE_VIS __narrow_to_utf8<32> : public codecvt { _LIBCPP_INLINE_VISIBILITY __narrow_to_utf8() : codecvt(1) {} +_LIBCPP_SUPPRESS_DEPRECATED_POP _LIBCPP_EXPORTED_FROM_ABI ~__narrow_to_utf8(); @@ -1380,12 +1576,14 @@ struct __widen_from_utf8<8> } }; +_LIBCPP_SUPPRESS_DEPRECATED_PUSH template <> struct _LIBCPP_TEMPLATE_VIS __widen_from_utf8<16> : public codecvt { _LIBCPP_INLINE_VISIBILITY __widen_from_utf8() : codecvt(1) {} +_LIBCPP_SUPPRESS_DEPRECATED_POP _LIBCPP_EXPORTED_FROM_ABI ~__widen_from_utf8(); @@ -1407,19 +1605,21 @@ struct _LIBCPP_TEMPLATE_VIS __widen_from_utf8<16> if (__r == codecvt_base::error || __nn == __nb) __throw_runtime_error("locale not supported"); for (const char16_t* __p = __buf; __p < __bn; ++__p, ++__s) - *__s = (wchar_t)*__p; + *__s = *__p; __nb = __nn; } return __s; } }; +_LIBCPP_SUPPRESS_DEPRECATED_PUSH template <> struct _LIBCPP_TEMPLATE_VIS __widen_from_utf8<32> : public codecvt { _LIBCPP_INLINE_VISIBILITY __widen_from_utf8() : codecvt(1) {} +_LIBCPP_SUPPRESS_DEPRECATED_POP _LIBCPP_EXPORTED_FROM_ABI ~__widen_from_utf8(); @@ -1441,7 +1641,7 @@ struct _LIBCPP_TEMPLATE_VIS __widen_from_utf8<32> if (__r == codecvt_base::error || __nn == __nb) __throw_runtime_error("locale not supported"); for (const char32_t* __p = __buf; __p < __bn; ++__p, ++__s) - *__s = (wchar_t)*__p; + *__s = *__p; __nb = __nn; } return __s; diff --git a/lib/libcxx/include/__memory/allocator_traits.h b/lib/libcxx/include/__memory/allocator_traits.h new file mode 100644 index 0000000000..9443f61b71 --- /dev/null +++ b/lib/libcxx/include/__memory/allocator_traits.h @@ -0,0 +1,401 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___MEMORY_ALLOCATOR_TRAITS_H +#define _LIBCPP___MEMORY_ALLOCATOR_TRAITS_H + +#include <__config> +#include <__memory/base.h> +#include <__memory/pointer_traits.h> +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +#pragma GCC system_header +#endif + +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + +_LIBCPP_BEGIN_NAMESPACE_STD + +#define _LIBCPP_ALLOCATOR_TRAITS_HAS_XXX(NAME, PROPERTY) \ + template struct NAME : false_type { }; \ + template struct NAME<_Tp, typename __void_t::type> : true_type { } + +// __pointer +_LIBCPP_ALLOCATOR_TRAITS_HAS_XXX(__has_pointer, pointer); +template ::type, + bool = __has_pointer<_RawAlloc>::value> +struct __pointer { + using type _LIBCPP_NODEBUG_TYPE = typename _RawAlloc::pointer; +}; +template +struct __pointer<_Tp, _Alloc, _RawAlloc, false> { + using type _LIBCPP_NODEBUG_TYPE = _Tp*; +}; + +// __const_pointer +_LIBCPP_ALLOCATOR_TRAITS_HAS_XXX(__has_const_pointer, const_pointer); +template ::value> +struct __const_pointer { + using type _LIBCPP_NODEBUG_TYPE = typename _Alloc::const_pointer; +}; +template +struct __const_pointer<_Tp, _Ptr, _Alloc, false> { +#ifdef _LIBCPP_CXX03_LANG + using type = typename pointer_traits<_Ptr>::template rebind::other; +#else + using type _LIBCPP_NODEBUG_TYPE = typename pointer_traits<_Ptr>::template rebind; +#endif +}; + +// __void_pointer +_LIBCPP_ALLOCATOR_TRAITS_HAS_XXX(__has_void_pointer, void_pointer); +template ::value> +struct __void_pointer { + using type _LIBCPP_NODEBUG_TYPE = typename _Alloc::void_pointer; +}; +template +struct __void_pointer<_Ptr, _Alloc, false> { +#ifdef _LIBCPP_CXX03_LANG + using type _LIBCPP_NODEBUG_TYPE = typename pointer_traits<_Ptr>::template rebind::other; +#else + using type _LIBCPP_NODEBUG_TYPE = typename pointer_traits<_Ptr>::template rebind; +#endif +}; + +// __const_void_pointer +_LIBCPP_ALLOCATOR_TRAITS_HAS_XXX(__has_const_void_pointer, const_void_pointer); +template ::value> +struct __const_void_pointer { + using type _LIBCPP_NODEBUG_TYPE = typename _Alloc::const_void_pointer; +}; +template +struct __const_void_pointer<_Ptr, _Alloc, false> { +#ifdef _LIBCPP_CXX03_LANG + using type _LIBCPP_NODEBUG_TYPE = typename pointer_traits<_Ptr>::template rebind::other; +#else + using type _LIBCPP_NODEBUG_TYPE = typename pointer_traits<_Ptr>::template rebind; +#endif +}; + +// __size_type +_LIBCPP_ALLOCATOR_TRAITS_HAS_XXX(__has_size_type, size_type); +template ::value> +struct __size_type : make_unsigned<_DiffType> { }; +template +struct __size_type<_Alloc, _DiffType, true> { + using type _LIBCPP_NODEBUG_TYPE = typename _Alloc::size_type; +}; + +// __alloc_traits_difference_type +_LIBCPP_ALLOCATOR_TRAITS_HAS_XXX(__has_alloc_traits_difference_type, difference_type); +template ::value> +struct __alloc_traits_difference_type { + using type _LIBCPP_NODEBUG_TYPE = typename pointer_traits<_Ptr>::difference_type; +}; +template +struct __alloc_traits_difference_type<_Alloc, _Ptr, true> { + using type _LIBCPP_NODEBUG_TYPE = typename _Alloc::difference_type; +}; + +// __propagate_on_container_copy_assignment +_LIBCPP_ALLOCATOR_TRAITS_HAS_XXX(__has_propagate_on_container_copy_assignment, propagate_on_container_copy_assignment); +template ::value> +struct __propagate_on_container_copy_assignment : false_type { }; +template +struct __propagate_on_container_copy_assignment<_Alloc, true> { + using type _LIBCPP_NODEBUG_TYPE = typename 
_Alloc::propagate_on_container_copy_assignment; +}; + +// __propagate_on_container_move_assignment +_LIBCPP_ALLOCATOR_TRAITS_HAS_XXX(__has_propagate_on_container_move_assignment, propagate_on_container_move_assignment); +template ::value> +struct __propagate_on_container_move_assignment : false_type { }; +template +struct __propagate_on_container_move_assignment<_Alloc, true> { + using type _LIBCPP_NODEBUG_TYPE = typename _Alloc::propagate_on_container_move_assignment; +}; + +// __propagate_on_container_swap +_LIBCPP_ALLOCATOR_TRAITS_HAS_XXX(__has_propagate_on_container_swap, propagate_on_container_swap); +template ::value> +struct __propagate_on_container_swap : false_type { }; +template +struct __propagate_on_container_swap<_Alloc, true> { + using type _LIBCPP_NODEBUG_TYPE = typename _Alloc::propagate_on_container_swap; +}; + +// __is_always_equal +_LIBCPP_ALLOCATOR_TRAITS_HAS_XXX(__has_is_always_equal, is_always_equal); +template ::value> +struct __is_always_equal : is_empty<_Alloc> { }; +template +struct __is_always_equal<_Alloc, true> { + using type _LIBCPP_NODEBUG_TYPE = typename _Alloc::is_always_equal; +}; + +// __allocator_traits_rebind +_LIBCPP_SUPPRESS_DEPRECATED_PUSH +template +struct __has_rebind_other : false_type { }; +template +struct __has_rebind_other<_Tp, _Up, typename __void_t< + typename _Tp::template rebind<_Up>::other +>::type> : true_type { }; + +template ::value> +struct __allocator_traits_rebind { + using type _LIBCPP_NODEBUG_TYPE = typename _Tp::template rebind<_Up>::other; +}; +template
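
The new __memory/allocator_traits.h above centralizes the member-type detection that allocator_traits relies on: the _LIBCPP_ALLOCATOR_TRAITS_HAS_XXX macro generates a void_t-based trait per member, and a primary template plus a partial specialization then select either the allocator's own nested type or the computed fallback. A compressed, generic restatement of that pattern (stand-alone names, not libc++'s internals):

    #include <memory>
    #include <type_traits>

    // Detect whether A names a member type `pointer`.
    template <class, class = void>
    struct has_pointer : std::false_type {};
    template <class A>
    struct has_pointer<A, std::void_t<typename A::pointer>> : std::true_type {};

    // Use A::pointer when it exists, otherwise fall back to T*.
    template <class T, class A, bool = has_pointer<A>::value>
    struct pointer_of { using type = typename A::pointer; };
    template <class T, class A>
    struct pointer_of<T, A, false> { using type = T*; };

    static_assert(std::is_same_v<pointer_of<int, std::allocator<int>>::type, int*>);

The real header repeats this primary/specialization pairing for const_pointer, void_pointer, size_type, difference_type, and the propagation traits visible above.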