summaryrefslogtreecommitdiffstats
path: root/multimedia
diff options
context:
space:
mode:
author Vijay Marcel <vijaymarcel@outlook.com>2024-02-22 21:39:25 +0700
committer Willy Sudiarto Raharjo <willysr@slackbuilds.org>2024-02-23 16:33:17 +0700
commita21af5200f52a8552c3adf249cfc9764820e1248 (patch)
tree11df7bbfa443edc3e70f07caee24fabb2a1627db /multimedia
parent560f3e11fef6a413e7594a7afc6ca72fc9d5c72d (diff)
downloadslackbuilds-a21af5200f52a8552c3adf249cfc9764820e1248.tar.gz
slackbuilds-a21af5200f52a8552c3adf249cfc9764820e1248.tar.xz
multimedia/uavs3d: Updated for version 1.1.
Signed-off-by: Willy Sudiarto Raharjo <willysr@slackbuilds.org>
Diffstat (limited to 'multimedia')
-rw-r--r--multimedia/uavs3d/changelog8
-rw-r--r--multimedia/uavs3d/fix-build-issue.patch7618
-rw-r--r--multimedia/uavs3d/fix-libdir-in-cmakelists.patch (renamed from multimedia/uavs3d/cmakelist.patch)21
-rw-r--r--multimedia/uavs3d/uavs3d.SlackBuild42
-rw-r--r--multimedia/uavs3d/uavs3d.info6
5 files changed, 7654 insertions, 41 deletions
diff --git a/multimedia/uavs3d/changelog b/multimedia/uavs3d/changelog
index 72ca012f62..478e92d365 100644
--- a/multimedia/uavs3d/changelog
+++ b/multimedia/uavs3d/changelog
@@ -11,3 +11,11 @@ Updated to Github commit 0133ee4
Patched the cmakelist file to build in i586 and i686 platforms.
Also patched the file to set the correct lib dir in 64 bit
platforms.
+
+21/02/2024:
+
+Switched to the versioned 1.1 source and
+patched it up to the latest git commit.
+From now on this will build only on x86 and x86_64.
+Upstream hasn't made a new release yet, but they have
+bumped the version to 1.2.0 internally.
diff --git a/multimedia/uavs3d/fix-build-issue.patch b/multimedia/uavs3d/fix-build-issue.patch
new file mode 100644
index 0000000000..8aa66107a7
--- /dev/null
+++ b/multimedia/uavs3d/fix-build-issue.patch
@@ -0,0 +1,7618 @@
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 5118377..46458c7 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -1,15 +1,20 @@
+-cmake_minimum_required(VERSION 2.8)
++cmake_minimum_required(VERSION 3.1)
+
+ project(uavs3d)
+
++option(COMPILE_10BIT "Enable 10bit streams decoding support." OFF)
++
++set(CMAKE_C_STANDARD 99)
++set(CMAKE_POSITION_INDEPENDENT_CODE ON)
++
+ aux_source_directory(./test DIR_SRC_TEST)
+-set_source_files_properties(${DIR_SRC_TEST} PROPERTIES COMPILE_FLAGS "${CMAKE_C_FLAGS} -std=c99 -O3")
+
+ add_subdirectory(./source)
+
+ add_executable(uavs3dec ${DIR_SRC_TEST})
+
+-target_link_libraries(uavs3dec m)
++if (NOT MSVC)
++ target_link_libraries(uavs3dec m)
++endif()
+ target_link_libraries(uavs3dec uavs3d)
+ #target_link_libraries(uavs3dec dl)
+-
+diff --git a/COPYING b/COPYING
+index 409d303..ce30f0f 100644
+--- a/COPYING
++++ b/COPYING
+@@ -1,4 +1,4 @@
+-Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
++Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+
+ All rights reserved.
+
+@@ -9,10 +9,7 @@ modification, are permitted provided that the following conditions are met:
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+-3. All advertising materials mentioning features or use of this software
+- must display the following acknowledgement:
+- This product includes the software uAVS3d developed by Peking University Shenzhen Graduate School, Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation.
+-4. Neither the name of the organizations (Peking University Shenzhen Graduate School, Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation)
++3. Neither the name of the organizations (Peking University Shenzhen Graduate School, Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation)
+ nor the names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+diff --git a/README.md b/README.md
+index e46ada8..9ce6a2a 100644
+--- a/README.md
++++ b/README.md
+@@ -5,15 +5,16 @@
+ 2) supports to compile for Android/IOS/Windows/Linux/MacOS systems.
+ 3) optimized for ARMv7/ARMv8/SSE4/AVX2 chips.
+ 4) 10bit decoding on all supported platforms.
+-
++ 5) The uavs3 codec has supported x86 and arm platforms, and has been tested and verified on the Kunpeng processor.
++ 6) The ARM platform recommends the Kunpeng processor.
+ # license
+ Copyright reserved by “Peking University Shenzhen Graduate School”, “Peng Cheng Laboratory”, and “Guangdong Bohua UHD Innovation Corporation” <br><br>
+- This program is a free software. You can redistribute it and/or modify it under the terms of the BSD 4-clause license. <br>
++ This program is a free software. You can redistribute it and/or modify it under the terms of the BSD 3-clause license. <br>
+ For more details, please view the file "COPYING" in the project.
+
+ # compile
+ The default configuration only support 8bit decoding. <br>
+- To support 10bit streams decoding, edit source/decore/com_def.h : #define BIT_DEPTH 10
++ To support 10bit streams decoding: cmake -DCOMPILE_10BIT=1
+
+ ## windows
+ Prerequisites:
+@@ -22,17 +23,20 @@ Prerequisites:
+ build:
+ 1. ./version.bat (to generate version.h)
+ 2. solution file: build/x86_windows/uavs3d.sln
++
++ To support 10bit streams decoding, edit source/decore/com_def.h : #define COMPILE_10BIT 1
+
+ ## linux/mac
+ Prerequisites:
+ 1. gawk (http://www.gnu.org/software/gawk/)
+- 2. CMake (https://cmake.org) version 2.8 or higher
++ 2. CMake (https://cmake.org) version 3.1 or higher
+
+ Build:
+ 1. mkdir build/linux
+- 2. cd build/linux && cmake ../..
++ 2. cd build/linux && cmake -DCOMPILE_10BIT=0 ../..
+ 3. make && make install
+
++ To support 10bit streams decoding: cmake -DCOMPILE_10BIT=1
+ to build shared library, set BUILD_SHARED_LIBS=1 please.
+
+ ## ios
+@@ -40,8 +44,11 @@ Prerequisites:
+ XCode
+
+ Build:
+-1. ./version.sh (generate the version.h)
+-2. xcode solution file: build/ios/uavs3d.xcodeproj
++ 1. ./version.sh (generate the version.h)
++ 2. xcode solution file: build/ios/uavs3d.xcodeproj
++
++ To support 10bit streams decoding:
++ Find Xcode -> PROJECT -> Build Settings -> Preprocessor Macros, add COMPILE_10BIT=1
+
+ ## android
+ Prerequisites:
+@@ -51,8 +58,10 @@ Build ndk library or executable file:
+ 1. ./version.sh (generate the version.h)
+ 2. cd build/android/ndk/jni
+ 3. $NDK_PATH/ndk-build
++
++ To support 10bit streams decoding: edit build/android/ndk/jni/uavs3d_main.mk:
+
+-The executable application for arm64-v8a is generated by default. <br>To generate static or shared library for other platforms, modify correlative options in Android.mk and Application.mk.
++ LOCAL_CFLAGS += -DCOMPILE_10BIT=1
+
+ # Run tests
+ ## window/linux/mac/android
+diff --git a/build/android/ndk/jni/Android.mk b/build/android/ndk/jni/Android.mk
+index fe79947..88d57f4 100644
+--- a/build/android/ndk/jni/Android.mk
++++ b/build/android/ndk/jni/Android.mk
+@@ -5,53 +5,26 @@ SRC_PATH := ../../../../source
+ INCLUDE_PATH := ../../../../source/decore
+
+ ### Name of the local module
+-include $(CLEAR_VARS)
+-LOCAL_MODULE := uavs3d
++include $(LOCAL_PATH)/uavs3d_clear_vars.mk
++LOCAL_MODULE := uavs3d-static
++LOCAL_MODULE_FILENAME := libuavs3d
++include $(LOCAL_PATH)/uavs3d_main.mk
++include $(BUILD_STATIC_LIBRARY)
++
++include $(LOCAL_PATH)/uavs3d_clear_vars.mk
++LOCAL_MODULE := uavs3d-shared
++LOCAL_MODULE_FILENAME := libuavs3d
+ LOCAL_LDLIBS:=-L$(SYSROOT)/usr/lib -lm -llog
+-
+-### for posix pthread
+-#LOCAL_SHARED_LIBRARIES := libcutil
+-
+-### include search path when compiling all sources (C,C++,Assembly)
+-LOCAL_C_INCLUDES +=$(INCLUDE_PATH) \
+- $(LOCAL_PATH)/../app
+-
+-### c source code
+-uavs3d_srcs_c += $(SRC_PATH)/decore/alf.c
+-uavs3d_srcs_c += $(SRC_PATH)/decore/deblock.c
+-uavs3d_srcs_c += $(SRC_PATH)/decore/inter_pred.c
+-uavs3d_srcs_c += $(SRC_PATH)/decore/intra_pred.c
+-uavs3d_srcs_c += $(SRC_PATH)/decore/inv_trans.c
+-uavs3d_srcs_c += $(SRC_PATH)/decore/pic_manager.c
+-uavs3d_srcs_c += $(SRC_PATH)/decore/recon.c
+-uavs3d_srcs_c += $(SRC_PATH)/decore/sao.c
+-uavs3d_srcs_c += $(SRC_PATH)/decore/com_table.c
+-uavs3d_srcs_c += $(SRC_PATH)/decore/threadpool.c
+-uavs3d_srcs_c += $(SRC_PATH)/decore/win32thread.c
+-uavs3d_srcs_c += $(SRC_PATH)/decore/com_util.c
+-uavs3d_srcs_c += $(SRC_PATH)/decoder/uavs3d.c
+-uavs3d_srcs_c += $(SRC_PATH)/decoder/bitstream.c
+-uavs3d_srcs_c += $(SRC_PATH)/decoder/parser.c
+-uavs3d_srcs_c += $(SRC_PATH)/decoder/dec_util.c
+-
+-LOCAL_CFLAGS += -O3 -fPIC -std=gnu99
+ LOCAL_LDFLAGS += -fPIC
++include $(LOCAL_PATH)/uavs3d_main.mk
++include $(BUILD_SHARED_LIBRARY)
++
+
+-#if build_executable
++include $(LOCAL_PATH)/uavs3d_clear_vars.mk
++LOCAL_MODULE := uavs3d
++LOCAL_LDLIBS:=-L$(SYSROOT)/usr/lib -lm -llog
+ LOCAL_CFLAGS += -pie -fPIE
+ LOCAL_LDFLAGS += -pie -fPIE
+-uavs3d_srcs_test+= $(SRC_PATH)/../test/utest.c
+-#endif
+-
+-#if build armv7a
+-#LOCAL_CFLAGS += -mfpu=neon
+-#include $(LOCAL_PATH)/uavs3d_armv7a.mk
+-#elif build arm64
+-include $(LOCAL_PATH)/uavs3d_arm64.mk
+-#endif
+-
+-LOCAL_SRC_FILES := $(uavs3d_srcs_c) $(uavs3d_srcs_arm) $(uavs3d_srcs_test)
+-
+-#include $(BUILD_SHARED_LIBRARY)
+-#include $(BUILD_STATIC_LIBRARY)
++uavs3d_srcs_test+= $(SRC_PATH)/../test/utest.c
++include $(LOCAL_PATH)/uavs3d_main.mk
+ include $(BUILD_EXECUTABLE)
+diff --git a/build/android/ndk/jni/Application.mk b/build/android/ndk/jni/Application.mk
+index 292946e..29e7e02 100644
+--- a/build/android/ndk/jni/Application.mk
++++ b/build/android/ndk/jni/Application.mk
+@@ -1,8 +1,9 @@
+ # APP_ABI := armeabi-v7a
+- APP_ABI := arm64-v8a
++# APP_ABI := arm64-v8a
+ # APP_ABI := armeabi
+ # APP_ABI := x86
+ # APP_ABI := x86_64
++APP_ABI := all
+ APP_OPTIM := release
+ # TARGET_BUILD_TYPE=release
+
+diff --git a/build/android/ndk/jni/uavs3d_avx2.mk b/build/android/ndk/jni/uavs3d_avx2.mk
+new file mode 100644
+index 0000000..d80401a
+--- /dev/null
++++ b/build/android/ndk/jni/uavs3d_avx2.mk
+@@ -0,0 +1,11 @@
++
++AVX_SRC_PATH:=../../../../source/decore/avx2
++
++uavs3d_srcs_avx += $(AVX_SRC_PATH)/alf_avx2.c
++uavs3d_srcs_avx += $(AVX_SRC_PATH)/avx2.c
++uavs3d_srcs_avx += $(AVX_SRC_PATH)/inter_pred_avx2.c
++uavs3d_srcs_avx += $(AVX_SRC_PATH)/intra_pred_avx2.c
++uavs3d_srcs_avx += $(AVX_SRC_PATH)/itrans_avx2.c
++uavs3d_srcs_avx += $(AVX_SRC_PATH)/pixel_avx2.c
++uavs3d_srcs_avx += $(AVX_SRC_PATH)/sao_avx2.c
++
+diff --git a/build/android/ndk/jni/uavs3d_clear_vars.mk b/build/android/ndk/jni/uavs3d_clear_vars.mk
+new file mode 100644
+index 0000000..0c47cfc
+--- /dev/null
++++ b/build/android/ndk/jni/uavs3d_clear_vars.mk
+@@ -0,0 +1,6 @@
++include $(CLEAR_VARS)
++uavs3d_srcs_c :=
++uavs3d_srcs_test :=
++uavs3d_srcs_arm :=
++uavs3d_srcs_sse :=
++uavs3d_srcs_avx :=
+diff --git a/build/android/ndk/jni/uavs3d_main.mk b/build/android/ndk/jni/uavs3d_main.mk
+new file mode 100644
+index 0000000..6506e89
+--- /dev/null
++++ b/build/android/ndk/jni/uavs3d_main.mk
+@@ -0,0 +1,61 @@
++
++### for posix pthread
++#LOCAL_SHARED_LIBRARIES := libcutil
++
++### include search path when compiling all sources (C,C++,Assembly)
++LOCAL_C_INCLUDES +=$(INCLUDE_PATH) \
++ $(LOCAL_PATH)/../app
++
++### c source code
++uavs3d_srcs_c += $(SRC_PATH)/decore/alf.c
++uavs3d_srcs_c += $(SRC_PATH)/decore/deblock.c
++uavs3d_srcs_c += $(SRC_PATH)/decore/inter_pred.c
++uavs3d_srcs_c += $(SRC_PATH)/decore/intra_pred.c
++uavs3d_srcs_c += $(SRC_PATH)/decore/inv_trans.c
++uavs3d_srcs_c += $(SRC_PATH)/decore/pic_manager.c
++uavs3d_srcs_c += $(SRC_PATH)/decore/recon.c
++uavs3d_srcs_c += $(SRC_PATH)/decore/sao.c
++uavs3d_srcs_c += $(SRC_PATH)/decore/com_table.c
++uavs3d_srcs_c += $(SRC_PATH)/decore/threadpool.c
++uavs3d_srcs_c += $(SRC_PATH)/decore/win32thread.c
++uavs3d_srcs_c += $(SRC_PATH)/decore/com_util.c
++uavs3d_srcs_c += $(SRC_PATH)/decoder/uavs3d.c
++uavs3d_srcs_c += $(SRC_PATH)/decoder/bitstream.c
++uavs3d_srcs_c += $(SRC_PATH)/decoder/parser.c
++uavs3d_srcs_c += $(SRC_PATH)/decoder/dec_util.c
++
++
++LOCAL_CFLAGS += -O3 -fPIC -std=gnu99 -I../../../source/decore
++
++### To support 10bit streams decoding: edit it to -DCOMPILE_10BIT=1
++LOCAL_CFLAGS += -DCOMPILE_10BIT=0
++
++ifeq ($(TARGET_ARCH),arm)
++ ifeq ($(TARGET_ARCH_ABI), armeabi-v7a)
++ # build armv7a
++ LOCAL_CFLAGS += -mfpu=neon -D_armv7a
++ include $(LOCAL_PATH)/uavs3d_armv7a.mk
++ endif
++endif
++
++ifeq ($(TARGET_ARCH),arm64)
++ # build arm64
++ LOCAL_CFLAGS += -D_arm64
++ include $(LOCAL_PATH)/uavs3d_arm64.mk
++endif
++
++ifeq ($(TARGET_ARCH),x86)
++ # build x86
++ LOCAL_CFLAGS += -msse4.2 -mavx2
++ include $(LOCAL_PATH)/uavs3d_sse2.mk
++ include $(LOCAL_PATH)/uavs3d_avx2.mk
++endif
++
++ifeq ($(TARGET_ARCH),x86_64)
++ # build x86_64
++ LOCAL_CFLAGS += -msse4.2 -mavx2
++ include $(LOCAL_PATH)/uavs3d_sse2.mk
++ include $(LOCAL_PATH)/uavs3d_avx2.mk
++endif
++
++LOCAL_SRC_FILES := $(uavs3d_srcs_c) $(uavs3d_srcs_arm) $(uavs3d_srcs_sse) $(uavs3d_srcs_avx) $(uavs3d_srcs_test)
+diff --git a/build/android/ndk/jni/uavs3d_sse2.mk b/build/android/ndk/jni/uavs3d_sse2.mk
+new file mode 100644
+index 0000000..1f8847a
+--- /dev/null
++++ b/build/android/ndk/jni/uavs3d_sse2.mk
+@@ -0,0 +1,11 @@
++
++SSE_SRC_PATH:=../../../../source/decore/sse
++
++uavs3d_srcs_sse += $(SSE_SRC_PATH)/alf_sse.c
++uavs3d_srcs_sse += $(SSE_SRC_PATH)/deblock_sse.c
++uavs3d_srcs_sse += $(SSE_SRC_PATH)/inter_pred_sse.c
++uavs3d_srcs_sse += $(SSE_SRC_PATH)/intra_pred_sse.c
++uavs3d_srcs_sse += $(SSE_SRC_PATH)/itrans_sse.c
++uavs3d_srcs_sse += $(SSE_SRC_PATH)/pixel_sse.c
++uavs3d_srcs_sse += $(SSE_SRC_PATH)/sao_sse.c
++uavs3d_srcs_sse += $(SSE_SRC_PATH)/sse.c
+diff --git a/build/x86_windows/common.vcxproj b/build/vs2017/common.vcxproj
+similarity index 61%
+rename from build/x86_windows/common.vcxproj
+rename to build/vs2017/common.vcxproj
+index c8cd533..95de5c4 100644
+--- a/build/x86_windows/common.vcxproj
++++ b/build/vs2017/common.vcxproj
+@@ -1,157 +1,249 @@
+-<?xml version="1.0" encoding="utf-8"?>
+-<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+- <ItemGroup Label="ProjectConfigurations">
+- <ProjectConfiguration Include="Debug|x64">
+- <Configuration>Debug</Configuration>
+- <Platform>x64</Platform>
+- </ProjectConfiguration>
+- <ProjectConfiguration Include="Release|x64">
+- <Configuration>Release</Configuration>
+- <Platform>x64</Platform>
+- </ProjectConfiguration>
+- </ItemGroup>
+- <ItemGroup>
+- <ClCompile Include="..\..\source\decore\alf.c" />
+- <ClCompile Include="..\..\source\decore\avx2\alf_avx2.c" />
+- <ClCompile Include="..\..\source\decore\avx2\avx2.c" />
+- <ClCompile Include="..\..\source\decore\avx2\inter_pred_avx2.c" />
+- <ClCompile Include="..\..\source\decore\avx2\intra_pred_avx2.c" />
+- <ClCompile Include="..\..\source\decore\avx2\itrans_avx2.c" />
+- <ClCompile Include="..\..\source\decore\avx2\pixel_avx2.c" />
+- <ClCompile Include="..\..\source\decore\avx2\sao_avx2.c" />
+- <ClCompile Include="..\..\source\decore\com_table.c" />
+- <ClCompile Include="..\..\source\decore\com_util.c" />
+- <ClCompile Include="..\..\source\decore\deblock.c" />
+- <ClCompile Include="..\..\source\decore\inter_pred.c" />
+- <ClCompile Include="..\..\source\decore\intra_pred.c" />
+- <ClCompile Include="..\..\source\decore\inv_trans.c" />
+- <ClCompile Include="..\..\source\decore\pic_manager.c" />
+- <ClCompile Include="..\..\source\decore\recon.c" />
+- <ClCompile Include="..\..\source\decore\sao.c" />
+- <ClCompile Include="..\..\source\decore\sse\alf_sse.c" />
+- <ClCompile Include="..\..\source\decore\sse\deblock_sse.c" />
+- <ClCompile Include="..\..\source\decore\sse\inter_pred_sse.c" />
+- <ClCompile Include="..\..\source\decore\sse\intra_pred_sse.c" />
+- <ClCompile Include="..\..\source\decore\sse\itrans_sse.c" />
+- <ClCompile Include="..\..\source\decore\sse\pixel_sse.c" />
+- <ClCompile Include="..\..\source\decore\sse\sao_sse.c" />
+- <ClCompile Include="..\..\source\decore\sse\sse.c" />
+- <ClCompile Include="..\..\source\decore\threadpool.c" />
+- <ClCompile Include="..\..\source\decore\win32thread.c" />
+- </ItemGroup>
+- <ItemGroup>
+- <ClInclude Include="..\..\source\decore\avx2\avx2.h" />
+- <ClInclude Include="..\..\source\decore\com_def.h" />
+- <ClInclude Include="..\..\source\decore\com_sys.h" />
+- <ClInclude Include="..\..\source\decore\com_table.h" />
+- <ClInclude Include="..\..\source\decore\com_type.h" />
+- <ClInclude Include="..\..\source\decore\com_util.h" />
+- <ClInclude Include="..\..\source\decore\modules.h" />
+- <ClInclude Include="..\..\source\decore\sse\sse.h" />
+- <ClInclude Include="..\..\source\decore\threadpool.h" />
+- <ClInclude Include="..\..\source\decore\win32thread.h" />
+- </ItemGroup>
+- <PropertyGroup Label="Globals">
+- <ProjectGuid>{3F9C7116-C287-40D7-865C-D8C89CF4FF31}</ProjectGuid>
+- <Keyword>Win32Proj</Keyword>
+- <RootNamespace>com_lib_vs17</RootNamespace>
+- <ProjectName>common</ProjectName>
+- <WindowsTargetPlatformVersion>10.0.17763.0</WindowsTargetPlatformVersion>
+- </PropertyGroup>
+- <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+- <ConfigurationType>StaticLibrary</ConfigurationType>
+- <UseDebugLibraries>true</UseDebugLibraries>
+- <CharacterSet>MultiByte</CharacterSet>
+- <PlatformToolset>v141</PlatformToolset>
+- </PropertyGroup>
+- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+- <ConfigurationType>StaticLibrary</ConfigurationType>
+- <UseDebugLibraries>false</UseDebugLibraries>
+- <WholeProgramOptimization>true</WholeProgramOptimization>
+- <CharacterSet>MultiByte</CharacterSet>
+- <PlatformToolset>v141</PlatformToolset>
+- </PropertyGroup>
+- <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+- <ImportGroup Label="ExtensionSettings">
+- </ImportGroup>
+- <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+- <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+- </ImportGroup>
+- <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+- <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+- </ImportGroup>
+- <PropertyGroup Label="UserMacros" />
+- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+- <OutDir>..\..\lib\</OutDir>
+- </PropertyGroup>
+- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+- <TargetName>$(ProjectName)</TargetName>
+- <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
+- </PropertyGroup>
+- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+- <OutDir>..\..\lib\</OutDir>
+- <TargetName>$(ProjectName)</TargetName>
+- <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
+- </PropertyGroup>
+- <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+- <ClCompile>
+- <PrecompiledHeader>NotUsing</PrecompiledHeader>
+- <WarningLevel>Level3</WarningLevel>
+- <Optimization>Disabled</Optimization>
+- <PreprocessorDefinitions>WIN64;X86F;_DEBUG;_LIB;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+- <AdditionalIncludeDirectories>..\..\source\decore</AdditionalIncludeDirectories>
+- <PrecompiledHeaderFile>
+- </PrecompiledHeaderFile>
+- <PrecompiledHeaderOutputFile>
+- </PrecompiledHeaderOutputFile>
+- <CompileAs>CompileAsC</CompileAs>
+- <ErrorReporting>Prompt</ErrorReporting>
+- <ProgramDataBaseFileName>$(IntDir)vc$(PlatformToolsetVersion).pdb</ProgramDataBaseFileName>
+- <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+- <TreatWarningAsError>true</TreatWarningAsError>
+- <AdditionalOptions>/arch:AVX %(AdditionalOptions)</AdditionalOptions>
+- </ClCompile>
+- <Link>
+- <SubSystem>Windows</SubSystem>
+- <GenerateDebugInformation>true</GenerateDebugInformation>
+- </Link>
+- <Lib>
+- <OutputFile>..\..\lib\$(ProjectName).lib</OutputFile>
+- </Lib>
+- </ItemDefinitionGroup>
+- <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+- <ClCompile>
+- <WarningLevel>Level3</WarningLevel>
+- <PrecompiledHeader>NotUsing</PrecompiledHeader>
+- <Optimization>MaxSpeed</Optimization>
+- <FunctionLevelLinking>true</FunctionLevelLinking>
+- <IntrinsicFunctions>true</IntrinsicFunctions>
+- <PreprocessorDefinitions>WIN64;X86F;_LIB;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions);</PreprocessorDefinitions>
+- <AdditionalIncludeDirectories>..\..\source\decore</AdditionalIncludeDirectories>
+- <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+- <PrecompiledHeaderFile>
+- </PrecompiledHeaderFile>
+- <PrecompiledHeaderOutputFile>
+- </PrecompiledHeaderOutputFile>
+- <CompileAs>CompileAsC</CompileAs>
+- <ErrorReporting>Prompt</ErrorReporting>
+- <ProgramDataBaseFileName>$(IntDir)vc$(PlatformToolsetVersion).pdb</ProgramDataBaseFileName>
+- <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+- <TreatWarningAsError>true</TreatWarningAsError>
+- <AdditionalOptions>/arch:AVX %(AdditionalOptions)</AdditionalOptions>
+- </ClCompile>
+- <Link>
+- <SubSystem>Windows</SubSystem>
+- <GenerateDebugInformation>true</GenerateDebugInformation>
+- <EnableCOMDATFolding>true</EnableCOMDATFolding>
+- <OptimizeReferences>true</OptimizeReferences>
+- </Link>
+- <Lib>
+- <OutputFile>..\..\lib\$(ProjectName).lib</OutputFile>
+- </Lib>
+- </ItemDefinitionGroup>
+- <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+- <ImportGroup Label="ExtensionTargets">
+- </ImportGroup>
++<?xml version="1.0" encoding="utf-8"?>
++<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
++ <ItemGroup Label="ProjectConfigurations">
++ <ProjectConfiguration Include="Debug|Win32">
++ <Configuration>Debug</Configuration>
++ <Platform>Win32</Platform>
++ </ProjectConfiguration>
++ <ProjectConfiguration Include="Debug|x64">
++ <Configuration>Debug</Configuration>
++ <Platform>x64</Platform>
++ </ProjectConfiguration>
++ <ProjectConfiguration Include="Release|Win32">
++ <Configuration>Release</Configuration>
++ <Platform>Win32</Platform>
++ </ProjectConfiguration>
++ <ProjectConfiguration Include="Release|x64">
++ <Configuration>Release</Configuration>
++ <Platform>x64</Platform>
++ </ProjectConfiguration>
++ </ItemGroup>
++ <ItemGroup>
++ <ClCompile Include="..\..\source\decore\alf.c" />
++ <ClCompile Include="..\..\source\decore\avx2\alf_avx2.c" />
++ <ClCompile Include="..\..\source\decore\avx2\avx2.c" />
++ <ClCompile Include="..\..\source\decore\avx2\inter_pred_avx2.c" />
++ <ClCompile Include="..\..\source\decore\avx2\intra_pred_avx2.c" />
++ <ClCompile Include="..\..\source\decore\avx2\itrans_avx2.c" />
++ <ClCompile Include="..\..\source\decore\avx2\pixel_avx2.c" />
++ <ClCompile Include="..\..\source\decore\avx2\sao_avx2.c" />
++ <ClCompile Include="..\..\source\decore\com_table.c" />
++ <ClCompile Include="..\..\source\decore\com_util.c" />
++ <ClCompile Include="..\..\source\decore\deblock.c" />
++ <ClCompile Include="..\..\source\decore\inter_pred.c" />
++ <ClCompile Include="..\..\source\decore\intra_pred.c" />
++ <ClCompile Include="..\..\source\decore\inv_trans.c" />
++ <ClCompile Include="..\..\source\decore\pic_manager.c" />
++ <ClCompile Include="..\..\source\decore\recon.c" />
++ <ClCompile Include="..\..\source\decore\sao.c" />
++ <ClCompile Include="..\..\source\decore\sse\alf_sse.c" />
++ <ClCompile Include="..\..\source\decore\sse\deblock_sse.c" />
++ <ClCompile Include="..\..\source\decore\sse\inter_pred_sse.c" />
++ <ClCompile Include="..\..\source\decore\sse\intra_pred_sse.c" />
++ <ClCompile Include="..\..\source\decore\sse\itrans_sse.c" />
++ <ClCompile Include="..\..\source\decore\sse\pixel_sse.c" />
++ <ClCompile Include="..\..\source\decore\sse\sao_sse.c" />
++ <ClCompile Include="..\..\source\decore\sse\sse.c" />
++ <ClCompile Include="..\..\source\decore\threadpool.c" />
++ <ClCompile Include="..\..\source\decore\win32thread.c" />
++ </ItemGroup>
++ <ItemGroup>
++ <ClInclude Include="..\..\source\decore\avx2\avx2.h" />
++ <ClInclude Include="..\..\source\decore\com_def.h" />
++ <ClInclude Include="..\..\source\decore\com_sys.h" />
++ <ClInclude Include="..\..\source\decore\com_table.h" />
++ <ClInclude Include="..\..\source\decore\com_type.h" />
++ <ClInclude Include="..\..\source\decore\com_util.h" />
++ <ClInclude Include="..\..\source\decore\modules.h" />
++ <ClInclude Include="..\..\source\decore\sse\sse.h" />
++ <ClInclude Include="..\..\source\decore\threadpool.h" />
++ <ClInclude Include="..\..\source\decore\win32thread.h" />
++ </ItemGroup>
++ <PropertyGroup Label="Globals">
++ <ProjectGuid>{3F9C7116-C287-40D7-865C-D8C89CF4FF31}</ProjectGuid>
++ <Keyword>Win32Proj</Keyword>
++ <RootNamespace>com_lib_vs17</RootNamespace>
++ <ProjectName>common</ProjectName>
++ <WindowsTargetPlatformVersion>10.0.16299.0</WindowsTargetPlatformVersion>
++ </PropertyGroup>
++ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
++ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
++ <ConfigurationType>StaticLibrary</ConfigurationType>
++ <UseDebugLibraries>true</UseDebugLibraries>
++ <CharacterSet>MultiByte</CharacterSet>
++ <PlatformToolset>v141</PlatformToolset>
++ </PropertyGroup>
++ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
++ <ConfigurationType>StaticLibrary</ConfigurationType>
++ <UseDebugLibraries>true</UseDebugLibraries>
++ <CharacterSet>MultiByte</CharacterSet>
++ <PlatformToolset>v141</PlatformToolset>
++ </PropertyGroup>
++ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
++ <ConfigurationType>StaticLibrary</ConfigurationType>
++ <UseDebugLibraries>false</UseDebugLibraries>
++ <WholeProgramOptimization>true</WholeProgramOptimization>
++ <CharacterSet>MultiByte</CharacterSet>
++ <PlatformToolset>v141</PlatformToolset>
++ </PropertyGroup>
++ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
++ <ConfigurationType>StaticLibrary</ConfigurationType>
++ <UseDebugLibraries>false</UseDebugLibraries>
++ <WholeProgramOptimization>true</WholeProgramOptimization>
++ <CharacterSet>MultiByte</CharacterSet>
++ <PlatformToolset>v141</PlatformToolset>
++ </PropertyGroup>
++ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
++ <ImportGroup Label="ExtensionSettings">
++ </ImportGroup>
++ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
++ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
++ </ImportGroup>
++ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
++ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
++ </ImportGroup>
++ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
++ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
++ </ImportGroup>
++ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
++ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
++ </ImportGroup>
++ <PropertyGroup Label="UserMacros" />
++ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
++ <OutDir>..\..\lib\</OutDir>
++ </PropertyGroup>
++ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
++ <TargetName>$(ProjectName)</TargetName>
++ <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
++ </PropertyGroup>
++ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
++ <TargetName>$(ProjectName)</TargetName>
++ <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
++ </PropertyGroup>
++ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
++ <OutDir>..\..\lib\</OutDir>
++ <TargetName>$(ProjectName)</TargetName>
++ <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
++ </PropertyGroup>
++ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
++ <TargetName>$(ProjectName)</TargetName>
++ <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
++ </PropertyGroup>
++ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
++ <ClCompile>
++ <PrecompiledHeader>NotUsing</PrecompiledHeader>
++ <WarningLevel>Level3</WarningLevel>
++ <Optimization>Disabled</Optimization>
++ <PreprocessorDefinitions>WIN64;X86F;_DEBUG;_LIB;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
++ <AdditionalIncludeDirectories>..\..\source\decore</AdditionalIncludeDirectories>
++ <PrecompiledHeaderFile>
++ </PrecompiledHeaderFile>
++ <PrecompiledHeaderOutputFile>
++ </PrecompiledHeaderOutputFile>
++ <CompileAs>CompileAsC</CompileAs>
++ <ErrorReporting>Prompt</ErrorReporting>
++ <ProgramDataBaseFileName>$(IntDir)vc$(PlatformToolsetVersion).pdb</ProgramDataBaseFileName>
++ <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
++ <TreatWarningAsError>true</TreatWarningAsError>
++ <AdditionalOptions>/arch:AVX %(AdditionalOptions)</AdditionalOptions>
++ </ClCompile>
++ <Link>
++ <SubSystem>Windows</SubSystem>
++ <GenerateDebugInformation>true</GenerateDebugInformation>
++ </Link>
++ <Lib>
++ <OutputFile>..\..\lib\$(ProjectName).lib</OutputFile>
++ </Lib>
++ </ItemDefinitionGroup>
++ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
++ <ClCompile>
++ <PrecompiledHeader>NotUsing</PrecompiledHeader>
++ <WarningLevel>Level3</WarningLevel>
++ <Optimization>Disabled</Optimization>
++ <PreprocessorDefinitions>WIN64;X86F;_DEBUG;_LIB;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
++ <AdditionalIncludeDirectories>..\..\source\decore</AdditionalIncludeDirectories>
++ <PrecompiledHeaderFile>
++ </PrecompiledHeaderFile>
++ <PrecompiledHeaderOutputFile>
++ </PrecompiledHeaderOutputFile>
++ <CompileAs>CompileAsC</CompileAs>
++ <ErrorReporting>Prompt</ErrorReporting>
++ <ProgramDataBaseFileName>$(IntDir)vc$(PlatformToolsetVersion).pdb</ProgramDataBaseFileName>
++ <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
++ <TreatWarningAsError>true</TreatWarningAsError>
++ <AdditionalOptions>/arch:AVX %(AdditionalOptions)</AdditionalOptions>
++ </ClCompile>
++ <Link>
++ <SubSystem>Windows</SubSystem>
++ <GenerateDebugInformation>true</GenerateDebugInformation>
++ </Link>
++ <Lib>
++ <OutputFile>..\..\lib\$(ProjectName).lib</OutputFile>
++ </Lib>
++ </ItemDefinitionGroup>
++ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
++ <ClCompile>
++ <WarningLevel>Level3</WarningLevel>
++ <PrecompiledHeader>NotUsing</PrecompiledHeader>
++ <Optimization>MaxSpeed</Optimization>
++ <FunctionLevelLinking>true</FunctionLevelLinking>
++ <IntrinsicFunctions>true</IntrinsicFunctions>
++ <PreprocessorDefinitions>WIN64;X86F;_LIB;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions);</PreprocessorDefinitions>
++ <AdditionalIncludeDirectories>..\..\source\decore</AdditionalIncludeDirectories>
++ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
++ <PrecompiledHeaderFile>
++ </PrecompiledHeaderFile>
++ <PrecompiledHeaderOutputFile>
++ </PrecompiledHeaderOutputFile>
++ <CompileAs>CompileAsC</CompileAs>
++ <ErrorReporting>Prompt</ErrorReporting>
++ <ProgramDataBaseFileName>$(IntDir)vc$(PlatformToolsetVersion).pdb</ProgramDataBaseFileName>
++ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
++ <TreatWarningAsError>true</TreatWarningAsError>
++ <AdditionalOptions>/arch:AVX %(AdditionalOptions)</AdditionalOptions>
++ </ClCompile>
++ <Link>
++ <SubSystem>Windows</SubSystem>
++ <GenerateDebugInformation>true</GenerateDebugInformation>
++ <EnableCOMDATFolding>true</EnableCOMDATFolding>
++ <OptimizeReferences>true</OptimizeReferences>
++ </Link>
++ <Lib>
++ <OutputFile>..\..\lib\$(ProjectName).lib</OutputFile>
++ </Lib>
++ </ItemDefinitionGroup>
++ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
++ <ClCompile>
++ <WarningLevel>Level3</WarningLevel>
++ <PrecompiledHeader>NotUsing</PrecompiledHeader>
++ <Optimization>MaxSpeed</Optimization>
++ <FunctionLevelLinking>true</FunctionLevelLinking>
++ <IntrinsicFunctions>true</IntrinsicFunctions>
++ <PreprocessorDefinitions>WIN64;X86F;_LIB;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions);</PreprocessorDefinitions>
++ <AdditionalIncludeDirectories>..\..\source\decore</AdditionalIncludeDirectories>
++ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
++ <PrecompiledHeaderFile>
++ </PrecompiledHeaderFile>
++ <PrecompiledHeaderOutputFile>
++ </PrecompiledHeaderOutputFile>
++ <CompileAs>CompileAsC</CompileAs>
++ <ErrorReporting>Prompt</ErrorReporting>
++ <ProgramDataBaseFileName>$(IntDir)vc$(PlatformToolsetVersion).pdb</ProgramDataBaseFileName>
++ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
++ <TreatWarningAsError>true</TreatWarningAsError>
++ <AdditionalOptions>/arch:AVX %(AdditionalOptions)</AdditionalOptions>
++ </ClCompile>
++ <Link>
++ <SubSystem>Windows</SubSystem>
++ <GenerateDebugInformation>true</GenerateDebugInformation>
++ <EnableCOMDATFolding>true</EnableCOMDATFolding>
++ <OptimizeReferences>true</OptimizeReferences>
++ </Link>
++ <Lib>
++ <OutputFile>..\..\lib\$(ProjectName).lib</OutputFile>
++ </Lib>
++ </ItemDefinitionGroup>
++ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
++ <ImportGroup Label="ExtensionTargets">
++ </ImportGroup>
+ </Project>
+\ No newline at end of file
+diff --git a/build/x86_windows/common.vcxproj.filters b/build/vs2017/common.vcxproj.filters
+similarity index 100%
+rename from build/x86_windows/common.vcxproj.filters
+rename to build/vs2017/common.vcxproj.filters
+diff --git a/build/x86_windows/libuavs3d.vcxproj b/build/vs2017/libuavs3d.vcxproj
+similarity index 55%
+rename from build/x86_windows/libuavs3d.vcxproj
+rename to build/vs2017/libuavs3d.vcxproj
+index d92a51f..bb42a6c 100644
+--- a/build/x86_windows/libuavs3d.vcxproj
++++ b/build/vs2017/libuavs3d.vcxproj
+@@ -1,115 +1,193 @@
+-<?xml version="1.0" encoding="utf-8"?>
+-<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+- <ItemGroup Label="ProjectConfigurations">
+- <ProjectConfiguration Include="Debug|x64">
+- <Configuration>Debug</Configuration>
+- <Platform>x64</Platform>
+- </ProjectConfiguration>
+- <ProjectConfiguration Include="Release|x64">
+- <Configuration>Release</Configuration>
+- <Platform>x64</Platform>
+- </ProjectConfiguration>
+- </ItemGroup>
+- <ItemGroup>
+- <ClCompile Include="..\..\source\decoder\bitstream.c" />
+- <ClCompile Include="..\..\source\decoder\dec_util.c" />
+- <ClCompile Include="..\..\source\decoder\parser.c" />
+- <ClCompile Include="..\..\source\decoder\uavs3d.c" />
+- </ItemGroup>
+- <ItemGroup>
+- <ClInclude Include="..\..\contributor.h" />
+- <ClInclude Include="..\..\source\decoder\bitstream.h" />
+- <ClInclude Include="..\..\source\decoder\dec_type.h" />
+- <ClInclude Include="..\..\source\decoder\dec_util.h" />
+- <ClInclude Include="..\..\source\decoder\parser.h" />
+- <ClInclude Include="..\..\source\decoder\uavs3d.h" />
+- </ItemGroup>
+- <PropertyGroup Label="Globals">
+- <ProjectGuid>{40B445E8-306A-4C77-9B19-FC76C2379F79}</ProjectGuid>
+- <RootNamespace>dec_lib</RootNamespace>
+- <WindowsTargetPlatformVersion>10.0.17763.0</WindowsTargetPlatformVersion>
+- <ProjectName>libuavs3d</ProjectName>
+- </PropertyGroup>
+- <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+- <ConfigurationType>DynamicLibrary</ConfigurationType>
+- <UseDebugLibraries>true</UseDebugLibraries>
+- <PlatformToolset>v141</PlatformToolset>
+- <CharacterSet>MultiByte</CharacterSet>
+- </PropertyGroup>
+- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+- <ConfigurationType>DynamicLibrary</ConfigurationType>
+- <UseDebugLibraries>false</UseDebugLibraries>
+- <PlatformToolset>v141</PlatformToolset>
+- <WholeProgramOptimization>true</WholeProgramOptimization>
+- <CharacterSet>MultiByte</CharacterSet>
+- </PropertyGroup>
+- <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+- <ImportGroup Label="ExtensionSettings">
+- </ImportGroup>
+- <ImportGroup Label="Shared">
+- </ImportGroup>
+- <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+- <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+- </ImportGroup>
+- <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+- <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+- </ImportGroup>
+- <PropertyGroup Label="UserMacros" />
+- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+- <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
+- <OutDir>..\..\bin</OutDir>
+- <LibraryPath>..\..\lib;$(LibraryPath)</LibraryPath>
+- </PropertyGroup>
+- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+- <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
+- <OutDir>..\..\bin</OutDir>
+- <LibraryPath>..\..\lib;$(LibraryPath)</LibraryPath>
+- </PropertyGroup>
+- <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+- <ClCompile>
+- <WarningLevel>Level3</WarningLevel>
+- <Optimization>Disabled</Optimization>
+- <SDLCheck>true</SDLCheck>
+- <AdditionalIncludeDirectories>..\..\source\decore;..\..\source\decoder</AdditionalIncludeDirectories>
+- <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+- <ProgramDataBaseFileName>$(IntDir)vc$(PlatformToolsetVersion).pdb</ProgramDataBaseFileName>
+- <PreprocessorDefinitions>WIN64;;%(PreprocessorDefinitions);UAVS3D_EXPORTS;_DEBUG</PreprocessorDefinitions>
+- <TreatWarningAsError>true</TreatWarningAsError>
+- <AdditionalOptions>/arch:AVX %(AdditionalOptions)</AdditionalOptions>
+- </ClCompile>
+- <Link>
+- <GenerateDebugInformation>true</GenerateDebugInformation>
+- <AdditionalDependencies>common.lib;%(AdditionalDependencies)</AdditionalDependencies>
+- <ShowProgress>NotSet</ShowProgress>
+- <Version>1.0</Version>
+- </Link>
+- </ItemDefinitionGroup>
+- <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+- <ClCompile>
+- <WarningLevel>Level3</WarningLevel>
+- <Optimization>MaxSpeed</Optimization>
+- <FunctionLevelLinking>true</FunctionLevelLinking>
+- <IntrinsicFunctions>true</IntrinsicFunctions>
+- <SDLCheck>true</SDLCheck>
+- <AdditionalIncludeDirectories>..\..\source\decore;..\..\source\decoder</AdditionalIncludeDirectories>
+- <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+- <CompileAs>CompileAsC</CompileAs>
+- <ProgramDataBaseFileName>$(IntDir)vc$(PlatformToolsetVersion).pdb</ProgramDataBaseFileName>
+- <PreprocessorDefinitions>WIN64;;%(PreprocessorDefinitions);UAVS3D_EXPORTS;</PreprocessorDefinitions>
+- <TreatWarningAsError>true</TreatWarningAsError>
+- <AdditionalOptions>/arch:AVX %(AdditionalOptions)</AdditionalOptions>
+- </ClCompile>
+- <Link>
+- <GenerateDebugInformation>true</GenerateDebugInformation>
+- <EnableCOMDATFolding>true</EnableCOMDATFolding>
+- <OptimizeReferences>true</OptimizeReferences>
+- <AdditionalDependencies>common.lib;%(AdditionalDependencies)</AdditionalDependencies>
+- <ShowProgress>NotSet</ShowProgress>
+- <Version>1.0</Version>
+- </Link>
+- </ItemDefinitionGroup>
+- <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+- <ImportGroup Label="ExtensionTargets">
+- </ImportGroup>
++<?xml version="1.0" encoding="utf-8"?>
++<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
++ <ItemGroup Label="ProjectConfigurations">
++ <ProjectConfiguration Include="Debug|Win32">
++ <Configuration>Debug</Configuration>
++ <Platform>Win32</Platform>
++ </ProjectConfiguration>
++ <ProjectConfiguration Include="Debug|x64">
++ <Configuration>Debug</Configuration>
++ <Platform>x64</Platform>
++ </ProjectConfiguration>
++ <ProjectConfiguration Include="Release|Win32">
++ <Configuration>Release</Configuration>
++ <Platform>Win32</Platform>
++ </ProjectConfiguration>
++ <ProjectConfiguration Include="Release|x64">
++ <Configuration>Release</Configuration>
++ <Platform>x64</Platform>
++ </ProjectConfiguration>
++ </ItemGroup>
++ <ItemGroup>
++ <ClCompile Include="..\..\source\decoder\bitstream.c" />
++ <ClCompile Include="..\..\source\decoder\dec_util.c" />
++ <ClCompile Include="..\..\source\decoder\parser.c" />
++ <ClCompile Include="..\..\source\decoder\uavs3d.c" />
++ </ItemGroup>
++ <ItemGroup>
++ <ClInclude Include="..\..\contributor.h" />
++ <ClInclude Include="..\..\source\decoder\bitstream.h" />
++ <ClInclude Include="..\..\source\decoder\dec_type.h" />
++ <ClInclude Include="..\..\source\decoder\dec_util.h" />
++ <ClInclude Include="..\..\source\decoder\parser.h" />
++ <ClInclude Include="..\..\source\decoder\uavs3d.h" />
++ </ItemGroup>
++ <PropertyGroup Label="Globals">
++ <ProjectGuid>{40B445E8-306A-4C77-9B19-FC76C2379F79}</ProjectGuid>
++ <RootNamespace>dec_lib</RootNamespace>
++ <WindowsTargetPlatformVersion>10.0.16299.0</WindowsTargetPlatformVersion>
++ <ProjectName>libuavs3d</ProjectName>
++ </PropertyGroup>
++ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
++ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
++ <ConfigurationType>DynamicLibrary</ConfigurationType>
++ <UseDebugLibraries>true</UseDebugLibraries>
++ <PlatformToolset>v141</PlatformToolset>
++ <CharacterSet>MultiByte</CharacterSet>
++ </PropertyGroup>
++ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
++ <ConfigurationType>DynamicLibrary</ConfigurationType>
++ <UseDebugLibraries>true</UseDebugLibraries>
++ <PlatformToolset>v141</PlatformToolset>
++ <CharacterSet>MultiByte</CharacterSet>
++ </PropertyGroup>
++ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
++ <ConfigurationType>DynamicLibrary</ConfigurationType>
++ <UseDebugLibraries>false</UseDebugLibraries>
++ <PlatformToolset>v141</PlatformToolset>
++ <WholeProgramOptimization>true</WholeProgramOptimization>
++ <CharacterSet>MultiByte</CharacterSet>
++ </PropertyGroup>
++ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
++ <ConfigurationType>DynamicLibrary</ConfigurationType>
++ <UseDebugLibraries>false</UseDebugLibraries>
++ <PlatformToolset>v141</PlatformToolset>
++ <WholeProgramOptimization>true</WholeProgramOptimization>
++ <CharacterSet>MultiByte</CharacterSet>
++ </PropertyGroup>
++ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
++ <ImportGroup Label="ExtensionSettings">
++ </ImportGroup>
++ <ImportGroup Label="Shared">
++ </ImportGroup>
++ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
++ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
++ </ImportGroup>
++ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
++ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
++ </ImportGroup>
++ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
++ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
++ </ImportGroup>
++ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
++ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
++ </ImportGroup>
++ <PropertyGroup Label="UserMacros" />
++ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
++ <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
++ <OutDir>..\..\bin</OutDir>
++ <LibraryPath>..\..\lib;$(LibraryPath)</LibraryPath>
++ </PropertyGroup>
++ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
++ <LibraryPath>..\..\lib;$(LibraryPath)</LibraryPath>
++ <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
++ </PropertyGroup>
++ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
++ <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
++ <OutDir>..\..\bin</OutDir>
++ <LibraryPath>..\..\lib;$(LibraryPath)</LibraryPath>
++ </PropertyGroup>
++ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
++ <LibraryPath>..\..\lib;$(LibraryPath)</LibraryPath>
++ <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
++ </PropertyGroup>
++ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
++ <ClCompile>
++ <WarningLevel>Level3</WarningLevel>
++ <Optimization>Disabled</Optimization>
++ <SDLCheck>true</SDLCheck>
++ <AdditionalIncludeDirectories>..\..\source\decore;..\..\source\decoder</AdditionalIncludeDirectories>
++ <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
++ <ProgramDataBaseFileName>$(IntDir)vc$(PlatformToolsetVersion).pdb</ProgramDataBaseFileName>
++ <PreprocessorDefinitions>WIN64;;%(PreprocessorDefinitions);UAVS3D_EXPORTS;_DEBUG</PreprocessorDefinitions>
++ <TreatWarningAsError>true</TreatWarningAsError>
++ <AdditionalOptions>/arch:AVX %(AdditionalOptions)</AdditionalOptions>
++ </ClCompile>
++ <Link>
++ <GenerateDebugInformation>true</GenerateDebugInformation>
++ <AdditionalDependencies>common.lib;%(AdditionalDependencies)</AdditionalDependencies>
++ <ShowProgress>NotSet</ShowProgress>
++ <Version>1.0</Version>
++ </Link>
++ </ItemDefinitionGroup>
++ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
++ <ClCompile>
++ <WarningLevel>Level3</WarningLevel>
++ <Optimization>Disabled</Optimization>
++ <SDLCheck>true</SDLCheck>
++ <AdditionalIncludeDirectories>..\..\source\decore;..\..\source\decoder</AdditionalIncludeDirectories>
++ <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
++ <ProgramDataBaseFileName>$(IntDir)vc$(PlatformToolsetVersion).pdb</ProgramDataBaseFileName>
++ <PreprocessorDefinitions>WIN64;;%(PreprocessorDefinitions);UAVS3D_EXPORTS;_DEBUG</PreprocessorDefinitions>
++ <TreatWarningAsError>true</TreatWarningAsError>
++ <AdditionalOptions>/arch:AVX %(AdditionalOptions)</AdditionalOptions>
++ </ClCompile>
++ <Link>
++ <GenerateDebugInformation>true</GenerateDebugInformation>
++ <AdditionalDependencies>common.lib;%(AdditionalDependencies)</AdditionalDependencies>
++ <ShowProgress>NotSet</ShowProgress>
++ <Version>1.0</Version>
++ </Link>
++ </ItemDefinitionGroup>
++ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
++ <ClCompile>
++ <WarningLevel>Level3</WarningLevel>
++ <Optimization>MaxSpeed</Optimization>
++ <FunctionLevelLinking>true</FunctionLevelLinking>
++ <IntrinsicFunctions>true</IntrinsicFunctions>
++ <SDLCheck>true</SDLCheck>
++ <AdditionalIncludeDirectories>..\..\source\decore;..\..\source\decoder</AdditionalIncludeDirectories>
++ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
++ <CompileAs>CompileAsC</CompileAs>
++ <ProgramDataBaseFileName>$(IntDir)vc$(PlatformToolsetVersion).pdb</ProgramDataBaseFileName>
++ <PreprocessorDefinitions>WIN64;;%(PreprocessorDefinitions);UAVS3D_EXPORTS;</PreprocessorDefinitions>
++ <TreatWarningAsError>true</TreatWarningAsError>
++ <AdditionalOptions>/arch:AVX %(AdditionalOptions)</AdditionalOptions>
++ </ClCompile>
++ <Link>
++ <GenerateDebugInformation>true</GenerateDebugInformation>
++ <EnableCOMDATFolding>true</EnableCOMDATFolding>
++ <OptimizeReferences>true</OptimizeReferences>
++ <AdditionalDependencies>common.lib;%(AdditionalDependencies)</AdditionalDependencies>
++ <ShowProgress>NotSet</ShowProgress>
++ <Version>1.0</Version>
++ </Link>
++ </ItemDefinitionGroup>
++ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
++ <ClCompile>
++ <WarningLevel>Level3</WarningLevel>
++ <Optimization>MaxSpeed</Optimization>
++ <FunctionLevelLinking>true</FunctionLevelLinking>
++ <IntrinsicFunctions>true</IntrinsicFunctions>
++ <SDLCheck>true</SDLCheck>
++ <AdditionalIncludeDirectories>..\..\source\decore;..\..\source\decoder</AdditionalIncludeDirectories>
++ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
++ <CompileAs>CompileAsC</CompileAs>
++ <ProgramDataBaseFileName>$(IntDir)vc$(PlatformToolsetVersion).pdb</ProgramDataBaseFileName>
++ <PreprocessorDefinitions>WIN64;;%(PreprocessorDefinitions);UAVS3D_EXPORTS;</PreprocessorDefinitions>
++ <TreatWarningAsError>true</TreatWarningAsError>
++ <AdditionalOptions>/arch:AVX %(AdditionalOptions)</AdditionalOptions>
++ </ClCompile>
++ <Link>
++ <GenerateDebugInformation>true</GenerateDebugInformation>
++ <EnableCOMDATFolding>true</EnableCOMDATFolding>
++ <OptimizeReferences>true</OptimizeReferences>
++ <AdditionalDependencies>common.lib;%(AdditionalDependencies)</AdditionalDependencies>
++ <ShowProgress>NotSet</ShowProgress>
++ <Version>1.0</Version>
++ </Link>
++ </ItemDefinitionGroup>
++ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
++ <ImportGroup Label="ExtensionTargets">
++ </ImportGroup>
+ </Project>
+\ No newline at end of file
+diff --git a/build/x86_windows/uavs3d.sln b/build/vs2017/uavs3d.sln
+similarity index 68%
+rename from build/x86_windows/uavs3d.sln
+rename to build/vs2017/uavs3d.sln
+index e6d34c4..d098d33 100644
+--- a/build/x86_windows/uavs3d.sln
++++ b/build/vs2017/uavs3d.sln
+@@ -1,47 +1,61 @@
+-
+-Microsoft Visual Studio Solution File, Format Version 12.00
+-# Visual Studio 15
+-VisualStudioVersion = 15.0.27130.2026
+-MinimumVisualStudioVersion = 10.0.40219.1
+-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "common", "common.vcxproj", "{3F9C7116-C287-40D7-865C-D8C89CF4FF31}"
+-EndProject
+-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "uavs3d", "uavs3d.vcxproj", "{798F7D68-C94D-41AF-86A4-98F7726D172C}"
+- ProjectSection(ProjectDependencies) = postProject
+- {3F9C7116-C287-40D7-865C-D8C89CF4FF31} = {3F9C7116-C287-40D7-865C-D8C89CF4FF31}
+- {40B445E8-306A-4C77-9B19-FC76C2379F79} = {40B445E8-306A-4C77-9B19-FC76C2379F79}
+- EndProjectSection
+-EndProject
+-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libuavs3d", "libuavs3d.vcxproj", "{40B445E8-306A-4C77-9B19-FC76C2379F79}"
+- ProjectSection(ProjectDependencies) = postProject
+- {3F9C7116-C287-40D7-865C-D8C89CF4FF31} = {3F9C7116-C287-40D7-865C-D8C89CF4FF31}
+- EndProjectSection
+-EndProject
+-Global
+- GlobalSection(SolutionConfigurationPlatforms) = preSolution
+- Debug|x64 = Debug|x64
+- Release|x64 = Release|x64
+- EndGlobalSection
+- GlobalSection(ProjectConfigurationPlatforms) = postSolution
+- {3F9C7116-C287-40D7-865C-D8C89CF4FF31}.Debug|x64.ActiveCfg = Debug|x64
+- {3F9C7116-C287-40D7-865C-D8C89CF4FF31}.Debug|x64.Build.0 = Debug|x64
+- {3F9C7116-C287-40D7-865C-D8C89CF4FF31}.Release|x64.ActiveCfg = Release|x64
+- {3F9C7116-C287-40D7-865C-D8C89CF4FF31}.Release|x64.Build.0 = Release|x64
+- {798F7D68-C94D-41AF-86A4-98F7726D172C}.Debug|x64.ActiveCfg = Debug|x64
+- {798F7D68-C94D-41AF-86A4-98F7726D172C}.Debug|x64.Build.0 = Debug|x64
+- {798F7D68-C94D-41AF-86A4-98F7726D172C}.Release|x64.ActiveCfg = Release|x64
+- {798F7D68-C94D-41AF-86A4-98F7726D172C}.Release|x64.Build.0 = Release|x64
+- {40B445E8-306A-4C77-9B19-FC76C2379F79}.Debug|x64.ActiveCfg = Debug|x64
+- {40B445E8-306A-4C77-9B19-FC76C2379F79}.Debug|x64.Build.0 = Debug|x64
+- {40B445E8-306A-4C77-9B19-FC76C2379F79}.Release|x64.ActiveCfg = Release|x64
+- {40B445E8-306A-4C77-9B19-FC76C2379F79}.Release|x64.Build.0 = Release|x64
+- EndGlobalSection
+- GlobalSection(SolutionProperties) = preSolution
+- HideSolutionNode = FALSE
+- EndGlobalSection
+- GlobalSection(ExtensibilityGlobals) = postSolution
+- SolutionGuid = {ED69324B-A55F-49DC-91D3-5F1D34DF875C}
+- EndGlobalSection
+- GlobalSection(Performance) = preSolution
+- HasPerformanceSessions = true
+- EndGlobalSection
+-EndGlobal
++
++Microsoft Visual Studio Solution File, Format Version 12.00
++# Visual Studio 15
++VisualStudioVersion = 15.0.26228.4
++MinimumVisualStudioVersion = 10.0.40219.1
++Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "common", "common.vcxproj", "{3F9C7116-C287-40D7-865C-D8C89CF4FF31}"
++EndProject
++Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "uavs3d", "uavs3d.vcxproj", "{798F7D68-C94D-41AF-86A4-98F7726D172C}"
++ ProjectSection(ProjectDependencies) = postProject
++ {3F9C7116-C287-40D7-865C-D8C89CF4FF31} = {3F9C7116-C287-40D7-865C-D8C89CF4FF31}
++ {40B445E8-306A-4C77-9B19-FC76C2379F79} = {40B445E8-306A-4C77-9B19-FC76C2379F79}
++ EndProjectSection
++EndProject
++Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libuavs3d", "libuavs3d.vcxproj", "{40B445E8-306A-4C77-9B19-FC76C2379F79}"
++ ProjectSection(ProjectDependencies) = postProject
++ {3F9C7116-C287-40D7-865C-D8C89CF4FF31} = {3F9C7116-C287-40D7-865C-D8C89CF4FF31}
++ EndProjectSection
++EndProject
++Global
++ GlobalSection(SolutionConfigurationPlatforms) = preSolution
++ Debug|x64 = Debug|x64
++ Debug|x86 = Debug|x86
++ Release|x64 = Release|x64
++ Release|x86 = Release|x86
++ EndGlobalSection
++ GlobalSection(ProjectConfigurationPlatforms) = postSolution
++ {3F9C7116-C287-40D7-865C-D8C89CF4FF31}.Debug|x64.ActiveCfg = Debug|x64
++ {3F9C7116-C287-40D7-865C-D8C89CF4FF31}.Debug|x64.Build.0 = Debug|x64
++ {3F9C7116-C287-40D7-865C-D8C89CF4FF31}.Debug|x86.ActiveCfg = Debug|Win32
++ {3F9C7116-C287-40D7-865C-D8C89CF4FF31}.Debug|x86.Build.0 = Debug|Win32
++ {3F9C7116-C287-40D7-865C-D8C89CF4FF31}.Release|x64.ActiveCfg = Release|x64
++ {3F9C7116-C287-40D7-865C-D8C89CF4FF31}.Release|x64.Build.0 = Release|x64
++ {3F9C7116-C287-40D7-865C-D8C89CF4FF31}.Release|x86.ActiveCfg = Release|Win32
++ {3F9C7116-C287-40D7-865C-D8C89CF4FF31}.Release|x86.Build.0 = Release|Win32
++ {798F7D68-C94D-41AF-86A4-98F7726D172C}.Debug|x64.ActiveCfg = Debug|x64
++ {798F7D68-C94D-41AF-86A4-98F7726D172C}.Debug|x64.Build.0 = Debug|x64
++ {798F7D68-C94D-41AF-86A4-98F7726D172C}.Debug|x86.ActiveCfg = Debug|Win32
++ {798F7D68-C94D-41AF-86A4-98F7726D172C}.Debug|x86.Build.0 = Debug|Win32
++ {798F7D68-C94D-41AF-86A4-98F7726D172C}.Release|x64.ActiveCfg = Release|x64
++ {798F7D68-C94D-41AF-86A4-98F7726D172C}.Release|x64.Build.0 = Release|x64
++ {798F7D68-C94D-41AF-86A4-98F7726D172C}.Release|x86.ActiveCfg = Release|Win32
++ {798F7D68-C94D-41AF-86A4-98F7726D172C}.Release|x86.Build.0 = Release|Win32
++ {40B445E8-306A-4C77-9B19-FC76C2379F79}.Debug|x64.ActiveCfg = Debug|x64
++ {40B445E8-306A-4C77-9B19-FC76C2379F79}.Debug|x64.Build.0 = Debug|x64
++ {40B445E8-306A-4C77-9B19-FC76C2379F79}.Debug|x86.ActiveCfg = Debug|Win32
++ {40B445E8-306A-4C77-9B19-FC76C2379F79}.Debug|x86.Build.0 = Debug|Win32
++ {40B445E8-306A-4C77-9B19-FC76C2379F79}.Release|x64.ActiveCfg = Release|x64
++ {40B445E8-306A-4C77-9B19-FC76C2379F79}.Release|x64.Build.0 = Release|x64
++ {40B445E8-306A-4C77-9B19-FC76C2379F79}.Release|x86.ActiveCfg = Release|Win32
++ {40B445E8-306A-4C77-9B19-FC76C2379F79}.Release|x86.Build.0 = Release|Win32
++ EndGlobalSection
++ GlobalSection(SolutionProperties) = preSolution
++ HideSolutionNode = FALSE
++ EndGlobalSection
++ GlobalSection(ExtensibilityGlobals) = postSolution
++ SolutionGuid = {ED69324B-A55F-49DC-91D3-5F1D34DF875C}
++ EndGlobalSection
++ GlobalSection(Performance) = preSolution
++ HasPerformanceSessions = true
++ EndGlobalSection
++EndGlobal
+diff --git a/build/x86_windows/uavs3d.vcxproj b/build/vs2017/uavs3d.vcxproj
+similarity index 55%
+rename from build/x86_windows/uavs3d.vcxproj
+rename to build/vs2017/uavs3d.vcxproj
+index 9557243..25603ec 100644
+--- a/build/x86_windows/uavs3d.vcxproj
++++ b/build/vs2017/uavs3d.vcxproj
+@@ -1,118 +1,200 @@
+-<?xml version="1.0" encoding="utf-8"?>
+-<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+- <ItemGroup Label="ProjectConfigurations">
+- <ProjectConfiguration Include="Debug|x64">
+- <Configuration>Debug</Configuration>
+- <Platform>x64</Platform>
+- </ProjectConfiguration>
+- <ProjectConfiguration Include="Release|x64">
+- <Configuration>Release</Configuration>
+- <Platform>x64</Platform>
+- </ProjectConfiguration>
+- </ItemGroup>
+- <PropertyGroup Label="Globals">
+- <ProjectGuid>{798F7D68-C94D-41AF-86A4-98F7726D172C}</ProjectGuid>
+- <Keyword>Win32Proj</Keyword>
+- <RootNamespace>dec_test_vs17</RootNamespace>
+- <ProjectName>uavs3d</ProjectName>
+- <WindowsTargetPlatformVersion>10.0.17763.0</WindowsTargetPlatformVersion>
+- </PropertyGroup>
+- <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+- <ConfigurationType>Application</ConfigurationType>
+- <UseDebugLibraries>true</UseDebugLibraries>
+- <CharacterSet>MultiByte</CharacterSet>
+- <PlatformToolset>v141</PlatformToolset>
+- </PropertyGroup>
+- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+- <ConfigurationType>Application</ConfigurationType>
+- <UseDebugLibraries>false</UseDebugLibraries>
+- <WholeProgramOptimization>true</WholeProgramOptimization>
+- <CharacterSet>MultiByte</CharacterSet>
+- <PlatformToolset>v141</PlatformToolset>
+- </PropertyGroup>
+- <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+- <ImportGroup Label="ExtensionSettings">
+- </ImportGroup>
+- <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+- <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+- </ImportGroup>
+- <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+- <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+- </ImportGroup>
+- <PropertyGroup Label="UserMacros" />
+- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+- <LinkIncremental>false</LinkIncremental>
+- <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
+- <OutDir>$(SolutionDir)\..\..\bin</OutDir>
+- </PropertyGroup>
+- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+- <LinkIncremental>false</LinkIncremental>
+- <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
+- <OutDir>$(SolutionDir)\..\..\bin</OutDir>
+- </PropertyGroup>
+- <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+- <ClCompile>
+- <PrecompiledHeader>
+- </PrecompiledHeader>
+- <WarningLevel>Level3</WarningLevel>
+- <Optimization>Disabled</Optimization>
+- <PreprocessorDefinitions>WIN64;_CONSOLE;%(PreprocessorDefinitions);_DEBUG</PreprocessorDefinitions>
+- <AdditionalIncludeDirectories>..\..\inc;..\..\src</AdditionalIncludeDirectories>
+- <ProgramDataBaseFileName>$(IntDir)vc$(PlatformToolsetVersion).pdb</ProgramDataBaseFileName>
+- <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+- <TreatWarningAsError>true</TreatWarningAsError>
+- <AdditionalOptions>/arch:AVX %(AdditionalOptions)</AdditionalOptions>
+- </ClCompile>
+- <Link>
+- <SubSystem>Console</SubSystem>
+- <GenerateDebugInformation>true</GenerateDebugInformation>
+- <AdditionalDependencies>
+- </AdditionalDependencies>
+- <AdditionalLibraryDirectories>..\..\lib</AdditionalLibraryDirectories>
+- </Link>
+- </ItemDefinitionGroup>
+- <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+- <ClCompile>
+- <WarningLevel>Level3</WarningLevel>
+- <PrecompiledHeader>
+- </PrecompiledHeader>
+- <Optimization>MaxSpeed</Optimization>
+- <FunctionLevelLinking>true</FunctionLevelLinking>
+- <IntrinsicFunctions>true</IntrinsicFunctions>
+- <PreprocessorDefinitions>WIN64;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+- <AdditionalIncludeDirectories>..\..\inc;..\..\src</AdditionalIncludeDirectories>
+- <ProgramDataBaseFileName>$(IntDir)vc$(PlatformToolsetVersion).pdb</ProgramDataBaseFileName>
+- <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+- <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+- <TreatWarningAsError>true</TreatWarningAsError>
+- <AdditionalOptions>/arch:AVX %(AdditionalOptions)</AdditionalOptions>
+- </ClCompile>
+- <Link>
+- <SubSystem>Console</SubSystem>
+- <GenerateDebugInformation>true</GenerateDebugInformation>
+- <EnableCOMDATFolding>true</EnableCOMDATFolding>
+- <OptimizeReferences>true</OptimizeReferences>
+- <AdditionalDependencies>
+- </AdditionalDependencies>
+- <AdditionalLibraryDirectories>..\..\lib</AdditionalLibraryDirectories>
+- </Link>
+- </ItemDefinitionGroup>
+- <ItemGroup>
+- <ClInclude Include="..\..\test\utest.h" />
+- </ItemGroup>
+- <ItemGroup>
+- <ClCompile Include="..\..\test\utest.c" />
+- </ItemGroup>
+- <ItemGroup>
+- <ProjectReference Include="common.vcxproj">
+- <Project>{3f9c7116-c287-40d7-865c-d8c89cf4ff31}</Project>
+- </ProjectReference>
+- <ProjectReference Include="libuavs3d.vcxproj">
+- <Project>{40b445e8-306a-4c77-9b19-fc76c2379f79}</Project>
+- </ProjectReference>
+- </ItemGroup>
+- <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+- <ImportGroup Label="ExtensionTargets">
+- </ImportGroup>
++<?xml version="1.0" encoding="utf-8"?>
++<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
++ <ItemGroup Label="ProjectConfigurations">
++ <ProjectConfiguration Include="Debug|Win32">
++ <Configuration>Debug</Configuration>
++ <Platform>Win32</Platform>
++ </ProjectConfiguration>
++ <ProjectConfiguration Include="Debug|x64">
++ <Configuration>Debug</Configuration>
++ <Platform>x64</Platform>
++ </ProjectConfiguration>
++ <ProjectConfiguration Include="Release|Win32">
++ <Configuration>Release</Configuration>
++ <Platform>Win32</Platform>
++ </ProjectConfiguration>
++ <ProjectConfiguration Include="Release|x64">
++ <Configuration>Release</Configuration>
++ <Platform>x64</Platform>
++ </ProjectConfiguration>
++ </ItemGroup>
++ <PropertyGroup Label="Globals">
++ <ProjectGuid>{798F7D68-C94D-41AF-86A4-98F7726D172C}</ProjectGuid>
++ <Keyword>Win32Proj</Keyword>
++ <RootNamespace>dec_test_vs17</RootNamespace>
++ <ProjectName>uavs3d</ProjectName>
++ <WindowsTargetPlatformVersion>10.0.16299.0</WindowsTargetPlatformVersion>
++ </PropertyGroup>
++ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
++ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
++ <ConfigurationType>Application</ConfigurationType>
++ <UseDebugLibraries>true</UseDebugLibraries>
++ <CharacterSet>MultiByte</CharacterSet>
++ <PlatformToolset>v141</PlatformToolset>
++ </PropertyGroup>
++ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
++ <ConfigurationType>Application</ConfigurationType>
++ <UseDebugLibraries>true</UseDebugLibraries>
++ <CharacterSet>MultiByte</CharacterSet>
++ <PlatformToolset>v141</PlatformToolset>
++ </PropertyGroup>
++ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
++ <ConfigurationType>Application</ConfigurationType>
++ <UseDebugLibraries>false</UseDebugLibraries>
++ <WholeProgramOptimization>true</WholeProgramOptimization>
++ <CharacterSet>MultiByte</CharacterSet>
++ <PlatformToolset>v141</PlatformToolset>
++ </PropertyGroup>
++ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
++ <ConfigurationType>Application</ConfigurationType>
++ <UseDebugLibraries>false</UseDebugLibraries>
++ <WholeProgramOptimization>true</WholeProgramOptimization>
++ <CharacterSet>MultiByte</CharacterSet>
++ <PlatformToolset>v141</PlatformToolset>
++ </PropertyGroup>
++ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
++ <ImportGroup Label="ExtensionSettings">
++ </ImportGroup>
++ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
++ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
++ </ImportGroup>
++ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
++ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
++ </ImportGroup>
++ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
++ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
++ </ImportGroup>
++ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
++ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
++ </ImportGroup>
++ <PropertyGroup Label="UserMacros" />
++ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
++ <LinkIncremental>false</LinkIncremental>
++ <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
++ <OutDir>$(SolutionDir)\..\..\bin</OutDir>
++ </PropertyGroup>
++ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
++ <LinkIncremental>false</LinkIncremental>
++ <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
++ </PropertyGroup>
++ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
++ <LinkIncremental>false</LinkIncremental>
++ <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
++ <OutDir>$(SolutionDir)\..\..\bin</OutDir>
++ </PropertyGroup>
++ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
++ <LinkIncremental>false</LinkIncremental>
++ <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
++ </PropertyGroup>
++ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
++ <ClCompile>
++ <PrecompiledHeader>
++ </PrecompiledHeader>
++ <WarningLevel>Level3</WarningLevel>
++ <Optimization>Disabled</Optimization>
++ <PreprocessorDefinitions>WIN64;_CONSOLE;%(PreprocessorDefinitions);_DEBUG</PreprocessorDefinitions>
++ <AdditionalIncludeDirectories>..\..\inc;..\..\src</AdditionalIncludeDirectories>
++ <ProgramDataBaseFileName>$(IntDir)vc$(PlatformToolsetVersion).pdb</ProgramDataBaseFileName>
++ <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
++ <TreatWarningAsError>true</TreatWarningAsError>
++ <AdditionalOptions>/arch:AVX %(AdditionalOptions)</AdditionalOptions>
++ </ClCompile>
++ <Link>
++ <SubSystem>Console</SubSystem>
++ <GenerateDebugInformation>true</GenerateDebugInformation>
++ <AdditionalDependencies>
++ </AdditionalDependencies>
++ <AdditionalLibraryDirectories>..\..\lib</AdditionalLibraryDirectories>
++ </Link>
++ </ItemDefinitionGroup>
++ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
++ <ClCompile>
++ <PrecompiledHeader>
++ </PrecompiledHeader>
++ <WarningLevel>Level3</WarningLevel>
++ <Optimization>Disabled</Optimization>
++ <PreprocessorDefinitions>WIN64;_CONSOLE;%(PreprocessorDefinitions);_DEBUG</PreprocessorDefinitions>
++ <AdditionalIncludeDirectories>..\..\inc;..\..\src</AdditionalIncludeDirectories>
++ <ProgramDataBaseFileName>$(IntDir)vc$(PlatformToolsetVersion).pdb</ProgramDataBaseFileName>
++ <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
++ <TreatWarningAsError>true</TreatWarningAsError>
++ <AdditionalOptions>/arch:AVX %(AdditionalOptions)</AdditionalOptions>
++ </ClCompile>
++ <Link>
++ <SubSystem>Console</SubSystem>
++ <GenerateDebugInformation>true</GenerateDebugInformation>
++ <AdditionalDependencies>
++ </AdditionalDependencies>
++ <AdditionalLibraryDirectories>..\..\lib</AdditionalLibraryDirectories>
++ </Link>
++ </ItemDefinitionGroup>
++ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
++ <ClCompile>
++ <WarningLevel>Level3</WarningLevel>
++ <PrecompiledHeader>
++ </PrecompiledHeader>
++ <Optimization>MaxSpeed</Optimization>
++ <FunctionLevelLinking>true</FunctionLevelLinking>
++ <IntrinsicFunctions>true</IntrinsicFunctions>
++ <PreprocessorDefinitions>WIN64;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
++ <AdditionalIncludeDirectories>..\..\inc;..\..\src</AdditionalIncludeDirectories>
++ <ProgramDataBaseFileName>$(IntDir)vc$(PlatformToolsetVersion).pdb</ProgramDataBaseFileName>
++ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
++ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
++ <TreatWarningAsError>true</TreatWarningAsError>
++ <AdditionalOptions>/arch:AVX %(AdditionalOptions)</AdditionalOptions>
++ </ClCompile>
++ <Link>
++ <SubSystem>Console</SubSystem>
++ <GenerateDebugInformation>true</GenerateDebugInformation>
++ <EnableCOMDATFolding>true</EnableCOMDATFolding>
++ <OptimizeReferences>true</OptimizeReferences>
++ <AdditionalDependencies>
++ </AdditionalDependencies>
++ <AdditionalLibraryDirectories>..\..\lib</AdditionalLibraryDirectories>
++ </Link>
++ </ItemDefinitionGroup>
++ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
++ <ClCompile>
++ <WarningLevel>Level3</WarningLevel>
++ <PrecompiledHeader>
++ </PrecompiledHeader>
++ <Optimization>MaxSpeed</Optimization>
++ <FunctionLevelLinking>true</FunctionLevelLinking>
++ <IntrinsicFunctions>true</IntrinsicFunctions>
++ <PreprocessorDefinitions>WIN64;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
++ <AdditionalIncludeDirectories>..\..\inc;..\..\src</AdditionalIncludeDirectories>
++ <ProgramDataBaseFileName>$(IntDir)vc$(PlatformToolsetVersion).pdb</ProgramDataBaseFileName>
++ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
++ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
++ <TreatWarningAsError>true</TreatWarningAsError>
++ <AdditionalOptions>/arch:AVX %(AdditionalOptions)</AdditionalOptions>
++ </ClCompile>
++ <Link>
++ <SubSystem>Console</SubSystem>
++ <GenerateDebugInformation>true</GenerateDebugInformation>
++ <EnableCOMDATFolding>true</EnableCOMDATFolding>
++ <OptimizeReferences>true</OptimizeReferences>
++ <AdditionalDependencies>
++ </AdditionalDependencies>
++ <AdditionalLibraryDirectories>..\..\lib</AdditionalLibraryDirectories>
++ </Link>
++ </ItemDefinitionGroup>
++ <ItemGroup>
++ <ClInclude Include="..\..\test\utest.h" />
++ </ItemGroup>
++ <ItemGroup>
++ <ClCompile Include="..\..\test\utest.c" />
++ </ItemGroup>
++ <ItemGroup>
++ <ProjectReference Include="common.vcxproj">
++ <Project>{3f9c7116-c287-40d7-865c-d8c89cf4ff31}</Project>
++ </ProjectReference>
++ <ProjectReference Include="libuavs3d.vcxproj">
++ <Project>{40b445e8-306a-4c77-9b19-fc76c2379f79}</Project>
++ </ProjectReference>
++ </ItemGroup>
++ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
++ <ImportGroup Label="ExtensionTargets">
++ </ImportGroup>
+ </Project>
+\ No newline at end of file
+diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
+index 4530a63..b52363e 100644
+--- a/source/CMakeLists.txt
++++ b/source/CMakeLists.txt
+@@ -1,19 +1,93 @@
+
+ set(LIBNAME uavs3d)
+
++# check cpu
++if("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "AMD64" OR
++ "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
++ if(${CMAKE_SIZEOF_VOID_P} EQUAL 4)
++ set(UAVS3D_TARGET_CPU "x86")
++ elseif(${CMAKE_SIZEOF_VOID_P} EQUAL 8)
++ set(UAVS3D_TARGET_CPU "x86_64")
++ else()
++ message(FATAL_ERROR
++ " Unexpected pointer size ${CMAKE_SIZEOF_VOID_P} for ${CMAKE_SYSTEM_PROCESSOR}\n")
++ endif()
++elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "i386" OR
++ "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86")
++ set(UAVS3D_TARGET_CPU "x86")
++elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "aarch64" OR
++ "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "arm64")
++ set(UAVS3D_TARGET_CPU "arm64")
++elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^arm")
++ set(UAVS3D_TARGET_CPU "armv7")
++elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "loongarch64")
++ set(UAVS3D_TARGET_CPU "loongarch64")
++else()
++ message(WARNING "unsupported architecture: ${CMAKE_SYSTEM_PROCESSOR}\n")
++ set(UAVS3D_TARGET_CPU "generic")
++endif()
++
+ # add source
+ aux_source_directory(./decoder DIR_UAVS3D_SRC)
+ aux_source_directory(./decore DIR_UAVS3D_CORE)
+-aux_source_directory(./decore/sse DIR_X86_SRC)
+-aux_source_directory(./decore/avx2 DIR_X86_256_SRC)
+
+ list(APPEND DIR_UAVS3D_SRC ${DIR_UAVS3D_CORE})
+
+ include_directories("decore")
++set(UAVS3D_ASM_FILES "")
++
++if("${UAVS3D_TARGET_CPU}" MATCHES "x86" OR
++ "${UAVS3D_TARGET_CPU}" MATCHES "x86_64")
++ aux_source_directory(./decore/sse DIR_X86_SRC)
++ aux_source_directory(./decore/avx2 DIR_X86_256_SRC)
++ set_source_files_properties(${DIR_X86_SRC} PROPERTIES COMPILE_FLAGS "${CMAKE_C_FLAGS} -msse4.2")
++ set_source_files_properties(${DIR_X86_256_SRC} PROPERTIES COMPILE_FLAGS "${CMAKE_C_FLAGS} -mavx2")
++
++ list(APPEND UAVS3D_ASM_FILES ${DIR_X86_SRC})
++ list(APPEND UAVS3D_ASM_FILES ${DIR_X86_256_SRC})
++elseif("${UAVS3D_TARGET_CPU}" MATCHES "armv7")
++ list(APPEND UAVS3D_ASM_FILES "./decore/arm64/armv7.c")
++ list(APPEND UAVS3D_ASM_FILES "./decore/arm64/alf_armv7.S")
++ list(APPEND UAVS3D_ASM_FILES "./decore/arm64/deblock_armv7.S")
++ list(APPEND UAVS3D_ASM_FILES "./decore/arm64/def_armv7.S")
++ list(APPEND UAVS3D_ASM_FILES "./decore/arm64/inter_pred_armv7.S")
++ list(APPEND UAVS3D_ASM_FILES "./decore/arm64/intra_pred_armv7.S")
++ list(APPEND UAVS3D_ASM_FILES "./decore/arm64/dct2_armv7.S")
++ list(APPEND UAVS3D_ASM_FILES "./decore/arm64/itrans_dct8_dst7_armv7.S")
++ list(APPEND UAVS3D_ASM_FILES "./decore/arm64/pixel_armv7.S")
++ list(APPEND UAVS3D_ASM_FILES "./decore/arm64/sao_armv7.c")
++ list(APPEND UAVS3D_ASM_FILES "./decore/arm64/sao_kernel_armv7.S")
++
++ add_definitions(-D _armv7a)
++ enable_language(ASM)
++elseif("${UAVS3D_TARGET_CPU}" MATCHES "arm64")
++ list(APPEND UAVS3D_ASM_FILES "./decore/arm64/arm64.c")
++ list(APPEND UAVS3D_ASM_FILES "./decore/arm64/alf_arm64.S")
++ list(APPEND UAVS3D_ASM_FILES "./decore/arm64/deblock_arm64.S")
++ list(APPEND UAVS3D_ASM_FILES "./decore/arm64/def_arm64.S")
++ list(APPEND UAVS3D_ASM_FILES "./decore/arm64/inter_pred_arm64.S")
++ list(APPEND UAVS3D_ASM_FILES "./decore/arm64/intra_pred_arm64.S")
++ list(APPEND UAVS3D_ASM_FILES "./decore/arm64/intra_pred_chroma_arm64.S")
++ list(APPEND UAVS3D_ASM_FILES "./decore/arm64/itrans_arm64.c")
++ list(APPEND UAVS3D_ASM_FILES "./decore/arm64/itrans_dct2_arm64.S")
++ list(APPEND UAVS3D_ASM_FILES "./decore/arm64/itrans_dct8_dst7_arm64.S")
++ list(APPEND UAVS3D_ASM_FILES "./decore/arm64/pixel_arm64.S")
++ list(APPEND UAVS3D_ASM_FILES "./decore/arm64/sao_arm64.c")
++ list(APPEND UAVS3D_ASM_FILES "./decore/arm64/sao_kernel_arm64.S")
+
+-set_source_files_properties(${DIR_UAVS3D_SRC} PROPERTIES COMPILE_FLAGS "${CMAKE_C_FLAGS} -fPIC -std=c99 -O3")
+-set_source_files_properties(${DIR_X86_SRC} PROPERTIES COMPILE_FLAGS "${CMAKE_C_FLAGS} -fPIC -std=c99 -O3 -msse4.2")
+-set_source_files_properties(${DIR_X86_256_SRC} PROPERTIES COMPILE_FLAGS "${CMAKE_C_FLAGS} -fPIC -std=c99 -O3 -mavx2")
++ add_definitions(-D _arm64)
++ enable_language(ASM)
++elseif("${UAVS3D_TARGET_CPU}" MATCHES "loongarch64")
++ # loongarch64
++endif()
++
++if(COMPILE_10BIT)
++ add_definitions(-DCOMPILE_10BIT=1)
++ message("-- compile 10bit")
++else()
++ add_definitions(-DCOMPILE_10BIT=0)
++ message("-- compile 8bit")
++endif()
+
+ # get version
+ set (CONFIG_DIR ${CMAKE_CURRENT_SOURCE_DIR}/..)
+@@ -30,6 +104,7 @@ endfunction()
+ extract_version_string("${CONFIG_DIR}/version.h" uavs3d_version)
+ MESSAGE(STATUS "uavs3d version \t\t: ${uavs3d_version}")
+
++MESSAGE(STATUS "Target CPU\t\t\t: ${UAVS3D_TARGET_CPU}")
+ # pkg-config
+ find_package(Threads REQUIRED)
+ set(prefix "${CMAKE_INSTALL_PREFIX}")
+@@ -65,9 +140,11 @@ MESSAGE(STATUS "BUILD_SHARED_LIBS \t\t: true")
+ else()
+ MESSAGE(STATUS "BUILD_SHARED_LIBS \t\t: false")
+ endif()
+-add_library(${LIBNAME} ${DIR_UAVS3D_SRC} ${DIR_X86_256_SRC} ${DIR_X86_SRC})
++add_library(${LIBNAME} ${DIR_UAVS3D_SRC} ${UAVS3D_ASM_FILES})
+
+-target_link_libraries(${LIBNAME} m)
++if (NOT MSVC)
++ target_link_libraries(${LIBNAME} m)
++endif()
+ if(CMAKE_USE_PTHREADS_INIT)
+ target_link_libraries(${LIBNAME} pthread)
+ endif()
+@@ -76,4 +153,3 @@ endif()
+ install(TARGETS uavs3d LIBRARY DESTINATION ${CMAKE_INSTALL_LIB_DIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIB_DIR})
+ install(FILES decoder/uavs3d.h DESTINATION ${CMAKE_INSTALL_INCLUDE_DIR})
+ install(FILES ${CONFIG_DIR}/${LIBNAME}.pc DESTINATION ${CMAKE_INSTALL_PKGCONFIG_DIR})
+-
+diff --git a/source/decoder/bitstream.c b/source/decoder/bitstream.c
+index 9c433ad..1c3aaac 100644
+--- a/source/decoder/bitstream.c
++++ b/source/decoder/bitstream.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decoder/bitstream.h b/source/decoder/bitstream.h
+index 60052b4..f1b1043 100644
+--- a/source/decoder/bitstream.h
++++ b/source/decoder/bitstream.h
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decoder/dec_type.h b/source/decoder/dec_type.h
+index 1761605..cfb0442 100644
+--- a/source/decoder/dec_type.h
++++ b/source/decoder/dec_type.h
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decoder/dec_util.c b/source/decoder/dec_util.c
+index e9374d9..e30e323 100644
+--- a/source/decoder/dec_util.c
++++ b/source/decoder/dec_util.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -250,7 +245,7 @@ static void uavs3d_always_inline com_mv_rounding_affine(s32 hor, s32 ver, s32 *
+ }
+ }
+
+-static void uavs3d_always_inline check_umve_motion_availability(int scup, int cu_width, int cu_height, int i_scu, int neighbor[NUM_SPATIAL_MV], int valid[NUM_SPATIAL_MV], com_scu_t * map_scu, s16(*map_mv)[REFP_NUM][MV_D], s8(*map_refi)[REFP_NUM])
++static void uavs3d_always_inline check_umve_motion_availability(int scup, int cu_width, int cu_height, int i_scu, int neighbor[5], int valid[5], com_scu_t * map_scu, s16(*map_mv)[REFP_NUM][MV_D], s8(*map_refi)[REFP_NUM])
+ {
+ int cu_width_in_scu = cu_width >> MIN_CU_LOG2;
+ int cu_height_in_scu = cu_height >> MIN_CU_LOG2;
+@@ -985,7 +980,6 @@ static int get_affine_merge_candidate(com_core_t *core, s8 mrg_list_refi[REFP_NU
+ int scup = core->cu_scup;
+ com_map_t *map = &core->map;
+ com_seqh_t *seqhdr = core->seqhdr;
+- com_ref_pic_t(*refp)[REFP_NUM] = core->refp;
+ int i_scu = seqhdr->i_scu;
+ int lidx, i, k;
+ int cu_width = core->cu_width;
+@@ -1112,6 +1106,7 @@ static int get_affine_merge_candidate(com_core_t *core, s8 mrg_list_refi[REFP_NU
+ int neb_addr_rb = scup + i_scu * (cu_height_in_scu - 1) + (cu_width_in_scu - 1);
+ int scu_y;
+ int scup_co = get_colocal_scup(neb_addr_rb, i_scu, seqhdr->pic_width_in_scu, seqhdr->pic_height_in_scu, &scu_y);
++ com_ref_pic_t(*refp)[REFP_NUM] = core->refp;
+
+ if (core->pichdr->slice_type == SLICE_B) {
+ uavs3d_check_ref_avaliable(refp[0][REFP_1].pic, scu_y << MIN_CU_LOG2);
+diff --git a/source/decoder/dec_util.h b/source/decoder/dec_util.h
+index 77c00a6..9d46217 100644
+--- a/source/decoder/dec_util.h
++++ b/source/decoder/dec_util.h
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decoder/parser.c b/source/decoder/parser.c
+index 45bb26a..9405daa 100644
+--- a/source/decoder/parser.c
++++ b/source/decoder/parser.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -88,6 +83,9 @@ int dec_parse_sqh(com_bs_t * bs, com_seqh_t * seqhdr)
+ dec_bs_read1(bs, 1); //marker_bit
+ seqhdr->vertical_size = dec_bs_read(bs, 14, 0, COM_UINT32_MAX);
+
++ seqhdr->display_horizontal_size = seqhdr->horizontal_size;
++ seqhdr->display_vertical_size = seqhdr->vertical_size;
++
+ seqhdr->chroma_format = (u8)dec_bs_read(bs, 2, 1, 1);
+ seqhdr->sample_precision = (u8)dec_bs_read(bs, 3, 1, 2);
+
+@@ -262,19 +260,21 @@ static int user_data(com_pic_header_t *pichdr, com_bs_t * bs)
+ return RET_OK;
+ }
+
+-static int sequence_display_extension(com_bs_t * bs)
++static int sequence_display_extension(com_bs_t * bs, com_seqh_t *seqhdr)
+ {
+ dec_bs_read(bs, 3, 0, COM_UINT32_MAX); // video_format u(3)
+ dec_bs_read1(bs, -1); // sample_range u(1)
+- int colour_description = dec_bs_read1(bs, -1); // colour_description u(1)
+- if (colour_description) {
+- dec_bs_read(bs, 8, 0, COM_UINT32_MAX); // colour_primaries u(8)
+- dec_bs_read(bs, 8, 0, COM_UINT32_MAX); // transfer_characteristics u(8)
+- dec_bs_read(bs, 8, 0, COM_UINT32_MAX); // matrix_coefficients u(8)
++
++ seqhdr->colour_description = dec_bs_read1(bs, -1); // colour_description u(1)
++
++ if (seqhdr->colour_description) {
++ seqhdr->colour_primaries = dec_bs_read(bs, 8, 0, COM_UINT32_MAX); // colour_primaries u(8)
++ seqhdr->transfer_characteristics = dec_bs_read(bs, 8, 0, COM_UINT32_MAX); // transfer_characteristics u(8)
++ seqhdr->matrix_coefficients = dec_bs_read(bs, 8, 0, COM_UINT32_MAX); // matrix_coefficients u(8)
+ }
+- dec_bs_read(bs, 14, 0, COM_UINT32_MAX); // display_horizontal_size u(14)
++ seqhdr->display_horizontal_size = dec_bs_read(bs, 14, 0, COM_UINT32_MAX); // display_horizontal_size u(14)
+ dec_bs_read1(bs, 1); //marker_bit
+- dec_bs_read(bs, 14, 0, COM_UINT32_MAX); // display_vertical_size u(14)
++ seqhdr->display_vertical_size = dec_bs_read(bs, 14, 0, COM_UINT32_MAX); // display_vertical_size u(14)
+ char td_mode_flag = dec_bs_read1(bs, -1); // td_mode_flag u(1)
+
+ if (td_mode_flag == 1) {
+@@ -460,7 +460,7 @@ static int extension_data(com_bs_t * bs, com_seqh_t *seqhdr, com_pic_header_t *p
+ if (i == 0) {
+ int ret = dec_bs_read(bs, 4, 0, COM_UINT32_MAX);
+ if (ret == 2) {
+- sequence_display_extension(bs);
++ sequence_display_extension(bs, seqhdr);
+ } else if (ret == 3) {
+ temporal_scalability_extension(bs);
+ } else if (ret == 4) {
+@@ -1647,7 +1647,7 @@ static uavs3d_always_inline u32 lbac_read_unary_sym_ep(com_lbac_t * lbac)
+ low = lbac_refill2(lbac, low);
+ }
+ val += bin;
+- } while (bin);
++ } while (bin && lbac->cur < lbac->end);
+
+ lbac->range = range;
+ lbac->low = low;
+@@ -2834,8 +2834,7 @@ int dec_parse_lcu_delta_qp(com_lbac_t * lbac, int last_dqp)
+ bin = lbac_dec_bin(lbac, ctx->lcu_qp_delta + act_ctx);
+ act_ctx = min(3, act_ctx + 1);
+ act_sym += !bin;
+- }
+- while (!bin);
++ } while (!bin && lbac->cur < lbac->end);
+ }
+
+ dquant = (act_sym + 1) >> 1;
+diff --git a/source/decoder/parser.h b/source/decoder/parser.h
+index 20cd103..93fbb1a 100644
+--- a/source/decoder/parser.h
++++ b/source/decoder/parser.h
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decoder/uavs3d.c b/source/decoder/uavs3d.c
+index 9ac7dc2..7b0491d 100644
+--- a/source/decoder/uavs3d.c
++++ b/source/decoder/uavs3d.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -1060,7 +1055,7 @@ int __cdecl uavs3d_flush(void *h, uavs3d_io_frm_t* frm_out)
+
+ void* __cdecl uavs3d_create(uavs3d_cfg_t * dec_cfg, uavs3d_lib_output_callback_t callback, int * err)
+ {
+- uavs3d_dec_t *ctx = NULL;
++ uavs3d_dec_t *ctx;
+
+ printf("libuavs3d(%2d): %s_%s, %s\n", BIT_DEPTH, VERSION_STR, VERSION_TYPE, VERSION_SHA1);
+
+diff --git a/source/decoder/uavs3d.h b/source/decoder/uavs3d.h
+index f5ea2b0..1931917 100644
+--- a/source/decoder/uavs3d.h
++++ b/source/decoder/uavs3d.h
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -137,6 +132,9 @@ typedef struct uavs3d_com_seqh_t {
+ unsigned int bbv_buffer_size; /* 18 bits */
+ int horizontal_size; /* 14 bits */
+ int vertical_size; /* 14 bits */
++ int display_horizontal_size; /* 14 bits */
++ int display_vertical_size; /* 14 bits */
++
+ unsigned char log2_max_cu_width_height; /* 3 bits */
+ unsigned char min_cu_size;
+ unsigned char max_part_ratio_log2;
+@@ -211,6 +209,13 @@ typedef struct uavs3d_com_seqh_t {
+
+ /* alf map */
+ unsigned char *alf_idx_map;
++
++ /* hdr info */
++ unsigned char colour_description;
++ unsigned char colour_primaries;
++ unsigned char transfer_characteristics;
++ unsigned char matrix_coefficients;
++
+ } com_seqh_t;
+
+ #define FRAME_MAX_PLANES 3
+diff --git a/source/decore/alf.c b/source/decore/alf.c
+index 89a2411..111d99f 100644
+--- a/source/decore/alf.c
++++ b/source/decore/alf.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/arm64/alf_arm64.S b/source/decore/arm64/alf_arm64.S
+index 49a620f..9998073 100644
+--- a/source/decore/arm64/alf_arm64.S
++++ b/source/decore/arm64/alf_arm64.S
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/arm64/arm64.c b/source/decore/arm64/arm64.c
+index 0850349..74c2f81 100644
+--- a/source/decore/arm64/arm64.c
++++ b/source/decore/arm64/arm64.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -102,7 +97,7 @@ static void uavs3d_padding_rows_chroma_arm64(pel *src, int i_src, int width, int
+
+ void uavs3d_funs_init_arm64()
+ {
+-#if (BIT_DEPTH == 8)
++#if !COMPILE_10BIT
+ uavs3d_funs_handle.sao[ Y_C] = uavs3d_sao_on_lcu_arm64;
+ uavs3d_funs_handle.sao[UV_C] = uavs3d_sao_on_lcu_chroma_arm64;
+ uavs3d_funs_handle.alf[ Y_C] = uavs3d_alf_one_lcu_arm64;
+@@ -311,7 +306,7 @@ void uavs3d_funs_init_arm64()
+ uavs3d_funs_handle.itrans_dst7[1] = uavs3d_itrans_dct8_pb8_arm64;
+ uavs3d_funs_handle.itrans_dst7[2] = uavs3d_itrans_dct8_pb16_arm64;
+
+- uavs3d_funs_handle.conv_fmt_16bit = uavs3d_conv_fmt_16bit_arm64;
++ //uavs3d_funs_handle.conv_fmt_16bit = uavs3d_conv_fmt_16bit_arm64;
+ uavs3d_funs_handle.padding_rows_luma = uavs3d_padding_rows_luma_arm64;
+ uavs3d_funs_handle.padding_rows_chroma = uavs3d_padding_rows_chroma_arm64;
+
+diff --git a/source/decore/arm64/arm64.h b/source/decore/arm64/arm64.h
+index 6ba0566..b2410ef 100644
+--- a/source/decore/arm64/arm64.h
++++ b/source/decore/arm64/arm64.h
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/arm64/deblock_arm64.S b/source/decore/arm64/deblock_arm64.S
+index ff47274..b152147 100644
+--- a/source/decore/arm64/deblock_arm64.S
++++ b/source/decore/arm64/deblock_arm64.S
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -1540,12 +1535,12 @@ deblock_ver_filtered:
+ bif v17.16b, v1.16b, v25.16b
+ bif v18.16b, v6.16b, v25.16b
+
+- mov v1.2d, v17.2d
+- mov v2.2d, v28.2d
+- mov v3.2d, v30.2d
+- mov v4.2d, v31.2d
+- mov v5.2d, v29.2d
+- mov v6.2d, v18.2d
++ mov v1.16b, v17.16b
++ mov v2.16b, v28.16b
++ mov v3.16b, v30.16b
++ mov v4.16b, v31.16b
++ mov v5.16b, v29.16b
++ mov v6.16b, v18.16b
+
+ st4 {v0.H, v1.H, v2.H, v3.H}[0], [x2], #8
+ st4 {v4.H, v5.H, v6.H, v7.H}[0], [x2], x5
+diff --git a/source/decore/arm64/def_arm64.S b/source/decore/arm64/def_arm64.S
+index c6a219d..82e267d 100644
+--- a/source/decore/arm64/def_arm64.S
++++ b/source/decore/arm64/def_arm64.S
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -35,25 +30,23 @@
+ * For more information, contact us at rgwang@pkusz.edu.cn.
+ **************************************************************************************/
+
+-#if defined(__ANDROID__)
+-.macro function name
+- .text
+- .align 4
+- .global \name
+- .type \name, %function
+- \name:
+-.endm
+-
+ #if defined(__aarch64__) && !defined(__arm64__)
+ #define __arm64__ 1
+ #endif
+-#elif defined(__APPLE__)
++
++#if defined(__APPLE__)
+ .macro function name
+ .text
+ .align 4
+ .global _\name
+ _\name:
+ .endm
++#else
++.macro function name
++ .text
++ .align 4
++ .global \name
++ .type \name, %function
++ \name:
++.endm
+ #endif
+-
+-#define COMPILE_10BIT 0
+diff --git a/source/decore/arm64/inter_pred_arm64.S b/source/decore/arm64/inter_pred_arm64.S
+index e86addd..375f041 100644
+--- a/source/decore/arm64/inter_pred_arm64.S
++++ b/source/decore/arm64/inter_pred_arm64.S
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -1809,9 +1804,9 @@ if_hor_ver_chroma_w8_loop_y:
+ smlal v30.4s, v22.4h, v7.h[3]
+ smlal2 v31.4s, v22.8h, v7.h[3]
+
+- mov v16.2d, v20.2d
+- mov v17.2d, v21.2d
+- mov v18.2d, v22.2d
++ mov v16.16b, v20.16b
++ mov v17.16b, v21.16b
++ mov v18.16b, v22.16b
+
+ rshrn v24.4h, v24.4s, #12
+ rshrn2 v24.8h, v25.4s, #12
+@@ -2553,10 +2548,10 @@ if_hor_ver_luma_w4_loop_y:
+ smlal v29.4s, v21.4h, v8.h[7]
+ smlal2 v30.4s, v21.8h, v8.h[7]
+
+- mov v16.2d, v18.2d
+- mov v17.2d, v19.2d
+- mov v18.2d, v20.2d
+- mov v19.2d, v21.2d
++ mov v16.16b, v18.16b
++ mov v17.16b, v19.16b
++ mov v18.16b, v20.16b
++ mov v19.16b, v21.16b
+
+ rshrn v27.4h, v27.4s, #12
+ rshrn v28.4h, v28.4s, #12
+@@ -5268,7 +5263,7 @@ function uavs3d_if_hor_ver_luma_w8_arm64
+ ldr w8, [sp] // w8 = max_val
+
+ sub sp, sp, #80
+- sub x9, sp, #16
++ add x9, sp, #64
+ st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp]
+ st1 {v11.2d}, [x9]
+
+diff --git a/source/decore/arm64/intra_pred_arm64.S b/source/decore/arm64/intra_pred_arm64.S
+index e2c4629..5689bd4 100644
+--- a/source/decore/arm64/intra_pred_arm64.S
++++ b/source/decore/arm64/intra_pred_arm64.S
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -2996,12 +2991,12 @@ intra_pred_bi_ipf_end:
+ // const s8 *flt_coef_hor, const s8 *flt_coef_ver, int w, int h, int bit_depth)
+ //src->x0, dst->x1, i_dst->x2, flt_range_hor->x3, flt_range_ver->x4, flt_coef_hor->x5, flt_coef_ver->x6, w->x7
+ function uavs3d_intra_pred_ipf_arm64
+-#if defined(__ANDROID__)
+- ldr w8, [sp] // w8 = h
+- ldr w9, [sp, #8] // w9 = bit_depth
+-#elif defined(__APPLE__)
++#if defined(__APPLE__)
+ ldr w8, [sp]
+ ldr w9, [sp, #4]
++#else
++ ldr w8, [sp] // w8 = h
++ ldr w9, [sp, #8] // w9 = bit_depth
+ #endif
+ add x0, x0, #1 // p_top = src + 1
+
+diff --git a/source/decore/arm64/intra_pred_chroma_arm64.S b/source/decore/arm64/intra_pred_chroma_arm64.S
+index 2c142f3..b12d02a 100644
+--- a/source/decore/arm64/intra_pred_chroma_arm64.S
++++ b/source/decore/arm64/intra_pred_chroma_arm64.S
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -555,7 +550,7 @@ intra_pred_dc_uv_fillblock_w8:
+ b intra_pred_dc_uv_end
+
+ intra_pred_dc_uv_fillblock_w16:
+- mov v1.8h, v0.8h
++ mov v1.16b, v0.16b
+ intra_pred_dc_uv_fillblock_w16_y:
+ st1 {v0.8h, v1.8h}, [x1], x2 // store dst[x]
+ st1 {v0.8h, v1.8h}, [x1], x2
+@@ -570,9 +565,9 @@ intra_pred_dc_uv_fillblock_w32x:
+ cmp w3, #64
+ beq intra_pred_dc_uv_fillblock_w64
+
+- mov v1.8h, v0.8h
+- mov v2.8h, v0.8h
+- mov v3.8h, v0.8h
++ mov v1.16b, v0.16b
++ mov v2.16b, v0.16b
++ mov v3.16b, v0.16b
+ intra_pred_dc_uv_fillblock_w32_y:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 // store dst[x]
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2
+@@ -585,9 +580,9 @@ intra_pred_dc_uv_fillblock_w32_y:
+ intra_pred_dc_uv_fillblock_w64:
+
+ sub x2, x2, #64
+- mov v1.8h, v0.8h
+- mov v2.8h, v0.8h
+- mov v3.8h, v0.8h
++ mov v1.16b, v0.16b
++ mov v2.16b, v0.16b
++ mov v3.16b, v0.16b
+ intra_pred_dc_uv_fillblock_w64_y:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2
+diff --git a/source/decore/arm64/itrans_arm64.c b/source/decore/arm64/itrans_arm64.c
+index d26ec5f..9b2362e 100644
+--- a/source/decore/arm64/itrans_arm64.c
++++ b/source/decore/arm64/itrans_arm64.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/arm64/itrans_arm64.h b/source/decore/arm64/itrans_arm64.h
+index c4977d0..15d9a4b 100644
+--- a/source/decore/arm64/itrans_arm64.h
++++ b/source/decore/arm64/itrans_arm64.h
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/arm64/itrans_dct2_arm64.S b/source/decore/arm64/itrans_dct2_arm64.S
+index 00b99e1..0f37c7e 100644
+--- a/source/decore/arm64/itrans_dct2_arm64.S
++++ b/source/decore/arm64/itrans_dct2_arm64.S
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -758,9 +753,9 @@ dct2_h16_1st_loopx:
+ dct2_h16_2nd_transform:
+
+ sub sp, sp, #48
+- sub x7, sp, #16
+- st1 {v10.8h, v11.8h}, [sp]
+- st1 {v12.8h}, [x7]
++ add x7, sp, #16
++ st1 {v10.8h, v11.8h}, [x7]
++ st1 {v12.8h}, [sp]
+
+ mov w8, #1
+ lsl w8, w8, w5
+@@ -1059,8 +1054,8 @@ dct2_h16_2nd_loopx:
+ cmp x8, x3
+ blt dct2_h16_2nd_loopx
+
+- ld1 {v10.8h, v11.8h}, [sp], #32
+ ld1 {v12.8h}, [sp], #16
++ ld1 {v10.8h, v11.8h}, [sp], #32
+ ld1 {v8.8h, v9.8h}, [sp], #32
+
+ dct2_h16_end:
+diff --git a/source/decore/arm64/itrans_dct8_dst7_arm64.S b/source/decore/arm64/itrans_dct8_dst7_arm64.S
+index 340865c..5ba39e5 100644
+--- a/source/decore/arm64/itrans_dct8_dst7_arm64.S
++++ b/source/decore/arm64/itrans_dct8_dst7_arm64.S
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/arm64/pixel_arm64.S b/source/decore/arm64/pixel_arm64.S
+index bc79826..81fa81c 100644
+--- a/source/decore/arm64/pixel_arm64.S
++++ b/source/decore/arm64/pixel_arm64.S
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -285,8 +280,8 @@ padding_rows_lr_y:
+
+ ld1r {v0.8h}, [x0]
+ ld1r {v2.8h}, [x5] // right reference pointer
+- mov v1.8h, v0.8h
+- mov v3.8h, v2.8h
++ mov v1.16b, v0.16b
++ mov v3.16b, v2.16b
+
+ sub x5, x0, x4
+ add x6, x0, x2
+@@ -332,8 +327,8 @@ padding_rows_chroma_lr_y:
+ ld1r {v0.4s}, [x0] // src[0] src[1]
+ ld1r {v2.4s}, [x5] // right reference pointer
+
+- mov v1.4s, v0.4s
+- mov v3.4s, v2.4s
++ mov v1.16b, v0.16b
++ mov v3.16b, v2.16b
+
+ sub x5, x0, x4
+ add x6, x0, x2
+diff --git a/source/decore/arm64/sao_arm64.c b/source/decore/arm64/sao_arm64.c
+index dc1a5b0..37cf9e0 100644
+--- a/source/decore/arm64/sao_arm64.c
++++ b/source/decore/arm64/sao_arm64.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/arm64/sao_kernel_arm64.S b/source/decore/arm64/sao_kernel_arm64.S
+index 1546b17..b0d0bb8 100644
+--- a/source/decore/arm64/sao_kernel_arm64.S
++++ b/source/decore/arm64/sao_kernel_arm64.S
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -451,10 +446,10 @@ loop_x_eo_90_chroma_end:
+ ************************************************************************************************************************************/
+ function uavs3d_sao_eo_135_arm64
+ // get start_x_r0 and end_x_r0
+-#if defined(__ANDROID__)
+- ldp x8, x9, [sp]
+-#elif defined(__APPLE__)
++#if defined(__APPLE__)
+ ldp w8, w9, [sp]
++#else
++ ldp x8, x9, [sp]
+ #endif
+ sxtw x8, w8 // start_x_r0
+ sxtw x9, w9 // end_x_r0
+@@ -537,10 +532,10 @@ test_loop_x_eo_135_end_r0:
+
+ // ------- middle rows -------
+ // get param
+-#if defined(__ANDROID__)
+- ldp x7, x8, [sp, #16]
+-#elif defined(__APPLE__)
++#if defined(__APPLE__)
+ ldp w7, w8, [sp, #8]
++#else
++ ldp x7, x8, [sp, #16]
+ #endif
+ sxtw x7, w7 // start_x_r
+ sxtw x8, w8 // end_x_r
+@@ -613,10 +608,10 @@ test_loop_x_eo_135_end_r:
+ bgt test_loop_y_eo_135_r
+
+ // ------- last row -------
+-#if defined(__ANDROID__)
+- ldp x6, x7, [sp, #32]
+-#elif defined(__APPLE__)
++#if defined(__APPLE__)
+ ldp w6, w7, [sp, #16]
++#else
++ ldp x6, x7, [sp, #32]
+ #endif
+ sxtw x6, w6 // start_x_rn
+ sxtw x7, w7 // end_x_rn
+@@ -691,10 +686,10 @@ test_loop_x_eo_135_end_rn:
+ ************************************************************************************************************************************/
+ function uavs3d_sao_eo_135_chroma_arm64
+
+-#if defined(__ANDROID__)
+- ldp x8, x9, [sp]
+-#elif defined(__APPLE__)
++#if defined(__APPLE__)
+ ldp w8, w9, [sp]
++#else
++ ldp x8, x9, [sp]
+ #endif
+ sxtw x8, w8 // start_x_r0
+ sxtw x9, w9 // end_x_r0
+@@ -793,10 +788,10 @@ loop_x_eo_135_chroma_end_r0:
+ add x1, x1, x3 //-- dst+=dst_stride
+
+ //--------------------------------middle rows--------------------------------
+-#if defined(__ANDROID__)
+- ldp x7 , x8, [sp, #16] //-- x7=start_x_r; x8=end_x_r
+-#elif defined(__APPLE__)
++#if defined(__APPLE__)
+ ldp w7 , w8, [sp, #8] //-- x7=start_x_r; x8=end_x_r
++#else
++ ldp x7 , x8, [sp, #16] //-- x7=start_x_r; x8=end_x_r
+ #endif
+ sxtw x7 , w7
+ sxtw x8 , w8
+@@ -886,10 +881,10 @@ loop_x_eo_135_chroma_end_r:
+ bgt loop_y_eo_135_chroma_r
+
+ //---------------------------------last row--------------------------------
+-#if defined(__ANDROID__)
+- ldp x6, x7, [sp, #32] //-- x6=start_x_rn; x7=end_x_rn
+-#elif defined(__APPLE__)
++#if defined(__APPLE__)
+ ldp w6, w7, [sp, #16] //-- x6=start_x_rn; x7=end_x_rn
++#else
++ ldp x6, x7, [sp, #32] //-- x6=start_x_rn; x7=end_x_rn
+ #endif
+ sxtw x7, w7
+ sxtw x6, w6
+@@ -979,10 +974,10 @@ loop_x_eo_135_chroma_end_rn:
+ * end_x_r0->x8, start_x_r->x9, end_x_r->x10, start_x_rn->x11, end_x_rn->x12
+ ************************************************************************************************************************************/
+ function uavs3d_sao_eo_45_arm64
+-#if defined(__ANDROID__)
+- ldp x8, x9, [sp]
+-#elif defined(__APPLE__)
++#if defined(__APPLE__)
+ ldp w8, w9, [sp]
++#else
++ ldp x8, x9, [sp]
+ #endif
+ sxtw x8, w8 // start_x_r0
+ sxtw x9, w9 // end_x_r0
+@@ -1064,10 +1059,10 @@ test_loop_x_eo_45_end_r0:
+
+ // ------- middle rows -------
+ // get param
+-#if defined(__ANDROID__)
+- ldp x7, x8, [sp, #16] // x7 start_x_r; x8 end_x_r
+-#elif defined(__APPLE__)
++#if defined(__APPLE__)
+ ldp w7, w8, [sp, #8] // x7 start_x_r; x8 end_x_r
++#else
++ ldp x7, x8, [sp, #16] // x7 start_x_r; x8 end_x_r
+ #endif
+ sxtw x7, w7
+ sxtw x8, w8
+@@ -1141,10 +1136,10 @@ test_loop_x_eo_45_end_r:
+ bgt test_loop_y_eo_45_r
+
+ // ------- last row -------
+-#if defined(__ANDROID__)
+- ldp x6, x7, [sp, #32] // $x6 start_x_rn; $x7 end_x_rn
+-#elif defined(__APPLE__)
++#if defined(__APPLE__)
+ ldp w6, w7, [sp, #16] // $x6 start_x_rn; $x7 end_x_rn
++#else
++ ldp x6, x7, [sp, #32] // $x6 start_x_rn; $x7 end_x_rn
+ #endif
+ sxtw x6, w6
+ sxtw x7, w7
+@@ -1217,10 +1212,10 @@ ret
+ * end_x_r0->x8, start_x_r->x9, end_x_r->x10, start_x_rn->x11, end_x_rn->x12
+ ************************************************************************************************************************************/
+ function uavs3d_sao_eo_45_chroma_arm64
+-#if defined(__ANDROID__)
+- ldp x8, x9, [sp]
+-#elif defined(__APPLE__)
++#if defined(__APPLE__)
+ ldp w8, w9, [sp]
++#else
++ ldp x8, x9, [sp]
+ #endif
+ sxtw x8, w8 // start_x_r0
+ sxtw x9, w9 // end_x_r0
+@@ -1315,10 +1310,10 @@ loop_x_eo_45_chroma_end_r0:
+ add x1, x1, x3 //-- dst+=dst_stride
+
+ //--------------------------------middle rows--------------------------------
+-#if defined(__ANDROID__)
+- ldp x7 , x8, [sp, #16] //-- x7=start_x_r; x8=end_x_r
+-#elif defined(__APPLE__)
++#if defined(__APPLE__)
+ ldp w7 , w8, [sp, #8] //-- x7=start_x_r; x8=end_x_r
++#else
++ ldp x7 , x8, [sp, #16] //-- x7=start_x_r; x8=end_x_r
+ #endif
+ sxtw x7 , w7
+ sxtw x8 , w8
+@@ -1405,10 +1400,10 @@ loop_x_eo_45_chroma_end_r:
+ bgt loop_y_eo_45_chroma_r
+
+ //---------------------------------last row--------------------------------
+-#if defined(__ANDROID__)
+- ldp x6 , x7, [sp, #32] //-- x6=start_x_rn; x7=end_x_rn
+-#elif defined(__APPLE__)
++#if defined(__APPLE__)
+ ldp w6 , w7, [sp, #16] //-- x6=start_x_rn; x7=end_x_rn
++#else
++ ldp x6 , x7, [sp, #32] //-- x6=start_x_rn; x7=end_x_rn
+ #endif
+ sxtw x7 , w7
+ sxtw x6 , w6
+@@ -2221,10 +2216,10 @@ loop_x_eo_90_chroma_end:
+ * src->x0, dst->x1, src_stride->x2, dst_stride->x3, offset->x4, mask->x5, mb_height->x6, bit_depth->x7
+ ************************************************************************************************************************************/
+ function uavs3d_sao_eo_135_arm64
+-#if defined(__ANDROID__)
+- ldp x8, x9, [sp] // start_x_r0 and end_x_r0
+-#elif defined(__APPLE__)
++#if defined(__APPLE__)
+ ldp w8, w9, [sp]
++#else
++ ldp x8, x9, [sp] // start_x_r0 and end_x_r0
+ #endif
+
+ sxtw x8, w8 // start_x_r0
+@@ -2336,10 +2331,10 @@ test_loop_x_eo_135_end_r0:
+
+ // ------- middle rows -------
+ // get param
+-#if defined(__ANDROID__)
+- ldp x7, x8, [sp, #16]
+-#elif defined(__APPLE__)
++#if defined(__APPLE__)
+ ldp w7, w8, [sp, #8]
++#else
++ ldp x7, x8, [sp, #16]
+ #endif
+ sxtw x7, w7 // x7 start_x_r
+ sxtw x8, w8 // x8 end_x_r
+@@ -2431,10 +2426,10 @@ test_loop_x_eo_135_end_r:
+ bgt test_loop_y_eo_135_r
+
+ // ------- last row -------
+-#if defined(__ANDROID__)
+- ldp x6, x7, [sp, #32]
+-#elif defined(__APPLE__)
++#if defined(__APPLE__)
+ ldp w6, w7, [sp, #16]
++#else
++ ldp x6, x7, [sp, #32]
+ #endif
+ sxtw x6, w6 // start_x_rn
+ sxtw x7, w7 // end_x_rn
+@@ -2527,10 +2522,10 @@ test_loop_x_eo_135_end_rn:
+ * src->x0, dst->x1, src_stride->x2, dst_stride->x3, offset->x4, mask->x5, mb_height->x6, bit_depth->x7
+ ************************************************************************************************************************************/
+ function uavs3d_sao_eo_135_chroma_arm64
+-#if defined(__ANDROID__)
+- ldp x8, x9, [sp] // start_x_r0 and end_x_r0
+-#elif defined(__APPLE__)
++#if defined(__APPLE__)
+ ldp w8, w9, [sp]
++#else
++ ldp x8, x9, [sp] // start_x_r0 and end_x_r0
+ #endif
+
+ mov w13, #1
+@@ -2636,10 +2631,10 @@ loop_x_eo_135_chroma_end_r0:
+ add x1, x1, x3 //-- dst+=dst_stride
+
+ //--------------------------------middle rows--------------------------------
+-#if defined(__ANDROID__)
+- ldp x7 , x8, [sp, #16] //-- x7=start_x_r; x8=end_x_r
+-#elif defined(__APPLE__)
++#if defined(__APPLE__)
+ ldp w7 , w8, [sp, #8] //-- x7=start_x_r; x8=end_x_r
++#else
++ ldp x7 , x8, [sp, #16] //-- x7=start_x_r; x8=end_x_r
+ #endif
+ sxtw x7 , w7
+ sxtw x8 , w8
+@@ -2730,10 +2725,10 @@ loop_x_eo_135_chroma_end_r:
+ bgt loop_y_eo_135_chroma_r
+
+ //---------------------------------last row--------------------------------
+-#if defined(__ANDROID__)
+- ldp x6, x7, [sp, #32] //-- x6=start_x_rn; x7=end_x_rn
+-#elif defined(__APPLE__)
++#if defined(__APPLE__)
+ ldp w6, w7, [sp, #16] //-- x6=start_x_rn; x7=end_x_rn
++#else
++ ldp x6, x7, [sp, #32] //-- x6=start_x_rn; x7=end_x_rn
+ #endif
+ sxtw x7, w7
+ sxtw x6, w6
+@@ -2822,10 +2817,10 @@ loop_x_eo_135_chroma_end_rn:
+ * src->x0, dst->x1, src_stride->x2, dst_stride->x3, offset->x4, mask->x5, mb_height->x6, bit_depth->x7
+ ************************************************************************************************************************************/
+ function uavs3d_sao_eo_45_arm64
+-#if defined(__ANDROID__)
+- ldp x8, x9, [sp] // start_x_r0 and end_x_r0
+-#elif defined(__APPLE__)
++#if defined(__APPLE__)
+ ldp w8, w9, [sp]
++#else
++ ldp x8, x9, [sp] // start_x_r0 and end_x_r0
+ #endif
+
+ mov w12, #1
+@@ -2936,10 +2931,10 @@ test_loop_x_eo_45_end_r0:
+
+ // ------- middle rows -------
+ // get param
+-#if defined(__ANDROID__)
+- ldp x7, x8, [sp, #16] // x7 start_x_r; x8 end_x_r
+-#elif defined(__APPLE__)
++#if defined(__APPLE__)
+ ldp w7, w8, [sp, #8] // x7 start_x_r; x8 end_x_r
++#else
++ ldp x7, x8, [sp, #16] // x7 start_x_r; x8 end_x_r
+ #endif
+ sxtw x7, w7
+ sxtw x8, w8
+@@ -3031,10 +3026,10 @@ test_loop_x_eo_45_end_r:
+ bgt test_loop_y_eo_45_r
+
+ // ------- last row -------
+-#if defined(__ANDROID__)
+- ldp x6, x7, [sp, #32]
+-#elif defined(__APPLE__)
++#if defined(__APPLE__)
+ ldp w6, w7, [sp, #16]
++#else
++ ldp x6, x7, [sp, #32]
+ #endif
+ sxtw x6, w6 // start_x_rn
+ sxtw x7, w7 // end_x_rn
+@@ -3126,10 +3121,10 @@ test_loop_x_eo_45_end_rn:
+ * src->x0, dst->x1, src_stride->x2, dst_stride->x3, offset->x4, mask->x5, mb_height->x6, bit_depth->x7
+ ************************************************************************************************************************************/
+ function uavs3d_sao_eo_45_chroma_arm64
+-#if defined(__ANDROID__)
+- ldp x8, x9, [sp] // start_x_r0 and end_x_r0
+-#elif defined(__APPLE__)
++#if defined(__APPLE__)
+ ldp w8, w9, [sp]
++#else
++ ldp x8, x9, [sp] // start_x_r0 and end_x_r0
+ #endif
+
+ mov w12, #1
+@@ -3236,10 +3231,10 @@ loop_x_eo_45_chroma_end_r0:
+ add x1, x1, x3 //-- dst+=dst_stride
+
+ //--------------------------------middle rows--------------------------------
+-#if defined(__ANDROID__)
+- ldp x7 , x8, [sp, #16] //-- x7=start_x_r; x8=end_x_r
+-#elif defined(__APPLE__)
++#if defined(__APPLE__)
+ ldp w7 , w8, [sp, #8] //-- x7=start_x_r; x8=end_x_r
++#else
++ ldp x7 , x8, [sp, #16] //-- x7=start_x_r; x8=end_x_r
+ #endif
+ sxtw x7 , w7
+ sxtw x8 , w8
+@@ -3328,10 +3323,10 @@ loop_x_eo_45_chroma_end_r:
+ bgt loop_y_eo_45_chroma_r
+
+ //---------------------------------last row--------------------------------
+-#if defined(__ANDROID__)
+- ldp x6 , x7, [sp, #32] //-- x6=start_x_rn; x7=end_x_rn
+-#elif defined(__APPLE__)
++#if defined(__APPLE__)
+ ldp w6 , w7, [sp, #16] //-- x6=start_x_rn; x7=end_x_rn
++#else
++ ldp x6 , x7, [sp, #32] //-- x6=start_x_rn; x7=end_x_rn
+ #endif
+ sxtw x7 , w7
+ sxtw x6 , w6
+diff --git a/source/decore/arm64/sao_kernel_arm64.h b/source/decore/arm64/sao_kernel_arm64.h
+index 03373ce..b9c448e 100644
+--- a/source/decore/arm64/sao_kernel_arm64.h
++++ b/source/decore/arm64/sao_kernel_arm64.h
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/armv7/alf_armv7.S b/source/decore/armv7/alf_armv7.S
+index 5b0e3e0..c87f29b 100644
+--- a/source/decore/armv7/alf_armv7.S
++++ b/source/decore/armv7/alf_armv7.S
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/armv7/armv7.c b/source/decore/armv7/armv7.c
+index 0be03a4..6ad9086 100644
+--- a/source/decore/armv7/armv7.c
++++ b/source/decore/armv7/armv7.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -242,7 +237,7 @@ void uavs3d_itrans_dct2_h64_w64_armv7(s16 *src, s16 *dst, int bit_depth)
+
+ void uavs3d_funs_init_armv7()
+ {
+-#if BIT_DEPTH == 8
++#if !COMPILE_10BIT
+ uavs3d_funs_handle.sao[ Y_C] = uavs3d_sao_on_lcu_armv7;
+ uavs3d_funs_handle.sao[UV_C] = uavs3d_sao_on_lcu_chroma_armv7;
+ uavs3d_funs_handle.alf[ Y_C] = uavs3d_alf_one_lcu_armv7;
+diff --git a/source/decore/armv7/armv7.h b/source/decore/armv7/armv7.h
+index 60efd7e..0114927 100644
+--- a/source/decore/armv7/armv7.h
++++ b/source/decore/armv7/armv7.h
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/armv7/dct2_armv7.S b/source/decore/armv7/dct2_armv7.S
+index 82c2f82..05e965f 100644
+--- a/source/decore/armv7/dct2_armv7.S
++++ b/source/decore/armv7/dct2_armv7.S
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/armv7/deblock_armv7.S b/source/decore/armv7/deblock_armv7.S
+index 202d121..dcdeabf 100644
+--- a/source/decore/armv7/deblock_armv7.S
++++ b/source/decore/armv7/deblock_armv7.S
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/armv7/def_armv7.S b/source/decore/armv7/def_armv7.S
+index fdc3d27..861f016 100644
+--- a/source/decore/armv7/def_armv7.S
++++ b/source/decore/armv7/def_armv7.S
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -35,23 +30,18 @@
+ * For more information, contact us at rgwang@pkusz.edu.cn.
+ **************************************************************************************/
+
+-#if defined(__ANDROID__)
+-.macro function name
+- .global \name
+- .hidden \name
+- .type \name, %function
+-\name:
+-.endm
+-#elif defined(__APPLE__)
++#if defined(__APPLE__)
+ .macro function name
+ .text
+ .align 4
+ .global _\name
+ _\name:
+ .endm
+-
++#else
++.macro function name
++ .global \name
++ .hidden \name
++ .type \name, %function
++\name:
++.endm
+ #endif
+-
+-#define COMPILE_10BIT 0
+-
+-
+diff --git a/source/decore/armv7/inter_pred_armv7.S b/source/decore/armv7/inter_pred_armv7.S
+index edbcc62..fd78ae4 100644
+--- a/source/decore/armv7/inter_pred_armv7.S
++++ b/source/decore/armv7/inter_pred_armv7.S
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/armv7/intra_pred_armv7.S b/source/decore/armv7/intra_pred_armv7.S
+index cbce583..807f6a5 100644
+--- a/source/decore/armv7/intra_pred_armv7.S
++++ b/source/decore/armv7/intra_pred_armv7.S
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/armv7/itrans_dct8_dst7_armv7.S b/source/decore/armv7/itrans_dct8_dst7_armv7.S
+index 129bbc3..d8d1ab1 100755
+--- a/source/decore/armv7/itrans_dct8_dst7_armv7.S
++++ b/source/decore/armv7/itrans_dct8_dst7_armv7.S
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/armv7/pixel_armv7.S b/source/decore/armv7/pixel_armv7.S
+index 83124a4..f88f474 100644
+--- a/source/decore/armv7/pixel_armv7.S
++++ b/source/decore/armv7/pixel_armv7.S
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/armv7/sao_armv7.c b/source/decore/armv7/sao_armv7.c
+index cec6880..8b3070e 100644
+--- a/source/decore/armv7/sao_armv7.c
++++ b/source/decore/armv7/sao_armv7.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/armv7/sao_kernel_armv7.S b/source/decore/armv7/sao_kernel_armv7.S
+index 0449ecc..1652c99 100644
+--- a/source/decore/armv7/sao_kernel_armv7.S
++++ b/source/decore/armv7/sao_kernel_armv7.S
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/armv7/sao_kernel_armv7.h b/source/decore/armv7/sao_kernel_armv7.h
+index d30c690..49b020b 100644
+--- a/source/decore/armv7/sao_kernel_armv7.h
++++ b/source/decore/armv7/sao_kernel_armv7.h
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/avx2/alf_avx2.c b/source/decore/avx2/alf_avx2.c
+index 74226b6..cc2cf5f 100644
+--- a/source/decore/avx2/alf_avx2.c
++++ b/source/decore/avx2/alf_avx2.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -489,7 +484,7 @@ void uavs3d_alf_one_lcu_one_chroma_avx2(pel *dst, int i_dst, pel *src, int i_src
+ __m256i T000, T001, T010, T011, T100, T101, T110, T111, T200, T201, T210, T211, T310, T311, T300, T301, T400, T401, T410, T411, T500, T501, T510, T511, T600, T601, T610, T611, T700, T701, T710, T711;
+ __m256i E00, E01, E10, E11;
+ __m256i C0, C1, C2, C3, C4, C5, C6, C7, C8;
+- __m256i S0, S00, S01, S1, S10, S11, S2, S20, S21, S3, S30, S31, S4, S40, S41, S5, S50, S51, S6, S7, S8, S60, S61, S70, S71, S80, S81, S82, S83, SS1, SS2, SS3, SS4;
++ __m256i S0, S00, S01, S1, S10, S11, S2, S20, S21, S3, S30, S31, S4, S40, S41, S5, S50, S51, S6, S7, S60, S61, S70, S71, S80, S81, S82, S83, SS1, SS2, SS3, SS4;
+ __m256i mAddOffset;
+ __m256i mZero = _mm256_set1_epi16(0);
+ __m256i mMax = _mm256_set1_epi16((short)((1 << sample_bit_depth) - 1));
+@@ -629,7 +624,7 @@ void uavs3d_alf_one_lcu_one_chroma_avx2(pel *dst, int i_dst, pel *src, int i_src
+ S5 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S50, 1));
+ S6 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S60, 1));
+ S7 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S70, 1));
+- S8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S80, 1));
++ //S8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S80, 1));
+ S0 = _mm256_add_epi32(S0, S1);
+ S2 = _mm256_add_epi32(S2, S3);
+ S4 = _mm256_add_epi32(S4, S5);
+@@ -803,7 +798,7 @@ void uavs3d_alf_one_lcu_one_chroma_avx2(pel *dst, int i_dst, pel *src, int i_src
+ S5 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S50, 1));
+ S6 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S60, 1));
+ S7 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S70, 1));
+- S8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S80, 1));
++ //S8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S80, 1));
+ S0 = _mm256_add_epi32(S0, S1);
+ S2 = _mm256_add_epi32(S2, S3);
+ S4 = _mm256_add_epi32(S4, S5);
+@@ -821,7 +816,7 @@ void uavs3d_alf_one_lcu_one_chroma_avx2(pel *dst, int i_dst, pel *src, int i_src
+ S5 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(S51));
+ S6 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(S61));
+ S7 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(S71));
+- S8 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(S81));
++ //S8 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(S81));
+ S0 = _mm256_add_epi32(S0, S1);
+ S2 = _mm256_add_epi32(S2, S3);
+ S4 = _mm256_add_epi32(S4, S5);
+@@ -839,7 +834,7 @@ void uavs3d_alf_one_lcu_one_chroma_avx2(pel *dst, int i_dst, pel *src, int i_src
+ S5 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S51, 1));
+ S6 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S61, 1));
+ S7 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S71, 1));
+- S8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S81, 1));
++ //S8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S81, 1));
+ S0 = _mm256_add_epi32(S0, S1);
+ S2 = _mm256_add_epi32(S2, S3);
+ S4 = _mm256_add_epi32(S4, S5);
+@@ -1055,7 +1050,6 @@ void uavs3d_alf_one_lcu_chroma_avx2(pel *dst, int i_dst, pel *src, int i_src, in
+ int startPos = 0;
+ int endPos = lcu_height;
+ int xPosEnd = lcu_width << 1;
+- long long tmp[8];
+
+ src += (startPos*i_src);
+ dst += (startPos*i_dst);
+@@ -1066,22 +1060,14 @@ void uavs3d_alf_one_lcu_chroma_avx2(pel *dst, int i_dst, pel *src, int i_src, in
+ T00 = _mm256_unpacklo_epi16(C8, C8);
+ T01 = _mm256_unpackhi_epi16(C8, C8);
+
+- tmp[0] = _mm256_extract_epi64(T00, 0); // win32 compile error if C0 = _mm256_set1_epi64x(_mm256_extract_epi64(T00, 0));
+- tmp[1] = _mm256_extract_epi64(T00, 1);
+- tmp[2] = _mm256_extract_epi64(T00, 2);
+- tmp[3] = _mm256_extract_epi64(T00, 3);
+- tmp[4] = _mm256_extract_epi64(T01, 0);
+- tmp[5] = _mm256_extract_epi64(T01, 1);
+- tmp[6] = _mm256_extract_epi64(T01, 2);
+- tmp[7] = _mm256_extract_epi64(T01, 3);
+- C0 = _mm256_set1_epi64x(tmp[0]);
+- C1 = _mm256_set1_epi64x(tmp[1]);
+- C2 = _mm256_set1_epi64x(tmp[2]);
+- C3 = _mm256_set1_epi64x(tmp[3]);
+- C4 = _mm256_set1_epi64x(tmp[4]);
+- C5 = _mm256_set1_epi64x(tmp[5]);
+- C6 = _mm256_set1_epi64x(tmp[6]);
+- C7 = _mm256_set1_epi64x(tmp[7]);
++ C0 = _mm256_permute4x64_epi64(T00, 0x00);
++ C1 = _mm256_permute4x64_epi64(T00, 0x55);
++ C2 = _mm256_permute4x64_epi64(T00, 0xaa);
++ C3 = _mm256_permute4x64_epi64(T00, 0xff);
++ C4 = _mm256_permute4x64_epi64(T01, 0x00);
++ C5 = _mm256_permute4x64_epi64(T01, 0x55);
++ C6 = _mm256_permute4x64_epi64(T01, 0xaa);
++ C7 = _mm256_permute4x64_epi64(T01, 0xff);
+ C8 = _mm256_set1_epi32((unsigned short)coef[16] + (((unsigned short)coef[17]) << 16));
+ C8 = _mm256_unpacklo_epi16(C8, C8);
+
+diff --git a/source/decore/avx2/avx2.c b/source/decore/avx2/avx2.c
+index a17dac5..d4b3ebf 100644
+--- a/source/decore/avx2/avx2.c
++++ b/source/decore/avx2/avx2.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -183,11 +178,13 @@ void uavs3d_funs_init_avx2()
+ uavs3d_funs_handle.ipcpy[4] = uavs3d_if_cpy_w64_avx2;
+ uavs3d_funs_handle.ipcpy[5] = uavs3d_if_cpy_w128_avx2;
+
++ uavs3d_funs_handle.ipflt[IPFILTER_H_4][1] = uavs3d_if_hor_chroma_w8_avx2;
+ uavs3d_funs_handle.ipflt[IPFILTER_H_4][2] = uavs3d_if_hor_chroma_w16_avx2;
+ uavs3d_funs_handle.ipflt[IPFILTER_H_4][3] = uavs3d_if_hor_chroma_w16x_avx2;
+ uavs3d_funs_handle.ipflt[IPFILTER_H_4][4] = uavs3d_if_hor_chroma_w16x_avx2;
+ uavs3d_funs_handle.ipflt[IPFILTER_H_4][5] = uavs3d_if_hor_chroma_w16x_avx2;
+
++ uavs3d_funs_handle.ipflt[IPFILTER_H_8][1] = uavs3d_if_hor_luma_w8_avx2;
+ uavs3d_funs_handle.ipflt[IPFILTER_H_8][2] = uavs3d_if_hor_luma_w16_avx2;
+ uavs3d_funs_handle.ipflt[IPFILTER_H_8][3] = uavs3d_if_hor_luma_w16x_avx2;
+ uavs3d_funs_handle.ipflt[IPFILTER_H_8][4] = uavs3d_if_hor_luma_w16x_avx2;
+@@ -198,6 +195,7 @@ void uavs3d_funs_init_avx2()
+ uavs3d_funs_handle.ipflt[IPFILTER_V_4][4] = uavs3d_if_ver_chroma_w32x_avx2;
+ uavs3d_funs_handle.ipflt[IPFILTER_V_4][5] = uavs3d_if_ver_chroma_w32x_avx2;
+
++ uavs3d_funs_handle.ipflt[IPFILTER_V_8][1] = uavs3d_if_ver_luma_w8_avx2;
+ uavs3d_funs_handle.ipflt[IPFILTER_V_8][2] = uavs3d_if_ver_luma_w16_avx2;
+ uavs3d_funs_handle.ipflt[IPFILTER_V_8][3] = uavs3d_if_ver_luma_w16x_avx2;
+ uavs3d_funs_handle.ipflt[IPFILTER_V_8][4] = uavs3d_if_ver_luma_w16x_avx2;
+@@ -209,6 +207,8 @@ void uavs3d_funs_init_avx2()
+ uavs3d_funs_handle.ipflt_ext[IPFILTER_EXT_4][4] = uavs3d_if_hor_ver_chroma_w16x_avx2;
+ uavs3d_funs_handle.ipflt_ext[IPFILTER_EXT_4][5] = uavs3d_if_hor_ver_chroma_w16x_avx2;
+
++ uavs3d_funs_handle.ipflt_ext[IPFILTER_EXT_8][0] = uavs3d_if_hor_ver_luma_w4_avx2;
++ uavs3d_funs_handle.ipflt_ext[IPFILTER_EXT_8][1] = uavs3d_if_hor_ver_luma_w8_avx2;
+ uavs3d_funs_handle.ipflt_ext[IPFILTER_EXT_8][2] = uavs3d_if_hor_ver_luma_w16x_avx2;
+ uavs3d_funs_handle.ipflt_ext[IPFILTER_EXT_8][3] = uavs3d_if_hor_ver_luma_w16x_avx2;
+ uavs3d_funs_handle.ipflt_ext[IPFILTER_EXT_8][4] = uavs3d_if_hor_ver_luma_w16x_avx2;
+diff --git a/source/decore/avx2/avx2.h b/source/decore/avx2/avx2.h
+index 43d0ad3..a29cfe8 100644
+--- a/source/decore/avx2/avx2.h
++++ b/source/decore/avx2/avx2.h
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/avx2/inter_pred_avx2.c b/source/decore/avx2/inter_pred_avx2.c
+index c7287d3..cbaa2fc 100644
+--- a/source/decore/avx2/inter_pred_avx2.c
++++ b/source/decore/avx2/inter_pred_avx2.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+-* Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++* Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+-* 3. All advertising materials mentioning features or use of this software
+-* must display the following acknowledgement:
+-* This product includes the software uAVS3d developed by
+-* Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+-* and Guangdong Bohua UHD Innovation Corporation.
+-* 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++* 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -33,7 +28,7 @@
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * For more information, contact us at rgwang@pkusz.edu.cn.
+-**************************************************************************************/
++**************************************************************************************/
+
+ #include "avx2.h"
+
+@@ -134,9 +129,11 @@ void uavs3d_if_hor_chroma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst
+
+ __m256i mCoefy1_hor = _mm256_set1_epi16(*(s16*)coeff);
+ __m256i mCoefy2_hor = _mm256_set1_epi16(*(s16*)(coeff + 2));
+- __m256i mSwitch = _mm256_setr_epi8(0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9);
++ __m256i mSwitch0 = _mm256_setr_epi8(0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9);
++ __m256i mSwitch1 = _mm256_setr_epi8(0+4, 2+4, 1+4, 3+4, 2+4, 4+4, 3+4, 5+4, 4+4, 6+4, 5+4, 7+4, 6+4, 8+4, 7+4, 9+4,
++ 0+4, 2+4, 1+4, 3+4, 2+4, 4+4, 3+4, 5+4, 4+4, 6+4, 5+4, 7+4, 6+4, 8+4, 7+4, 9+4);
+ __m256i mAddOffset = _mm256_set1_epi16(offset);
+- __m256i T0, T1, S0, S1, R0, R1, sum;
++ __m256i T0, T1, S0, R0, R1, sum;
+ __m128i s0, s1;
+
+ src -= 2;
+@@ -144,27 +141,27 @@ void uavs3d_if_hor_chroma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst
+ while (height > 0) {
+ s0 = _mm_loadu_si128((__m128i*)(src));
+ s1 = _mm_loadu_si128((__m128i*)(src + i_src));
++ src += i_src << 1;
++ uavs3d_prefetch(src, _MM_HINT_NTA);
++ uavs3d_prefetch(src + i_src, _MM_HINT_NTA);
+
+ S0 = _mm256_set_m128i(s1, s0);
+- S1 = _mm256_srli_si256(S0, 4);
+
+- R0 = _mm256_shuffle_epi8(S0, mSwitch); // 4 rows s0 and s1
+- R1 = _mm256_shuffle_epi8(S1, mSwitch);
++ R0 = _mm256_shuffle_epi8(S0, mSwitch0); // 4 rows s0 and s1
++ R1 = _mm256_shuffle_epi8(S0, mSwitch1);
+
+ T0 = _mm256_maddubs_epi16(R0, mCoefy1_hor); // 4x4: s0*c0 + s1*c1
+ T1 = _mm256_maddubs_epi16(R1, mCoefy2_hor);
+ sum = _mm256_add_epi16(T0, T1);
+
+- sum = _mm256_add_epi16(sum, mAddOffset);
++ sum = _mm256_add_epi16(sum, mAddOffset);
+ sum = _mm256_srai_epi16(sum, shift);
+
+ s0 = _mm_packus_epi16(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
+- s1 = _mm_srli_si128(s0, 8);
+ _mm_storel_epi64((__m128i*)(dst), s0);
+- _mm_storel_epi64((__m128i*)(dst + i_dst), s1);
++ _mm_storeh_pi((__m64*)(dst + i_dst), _mm_castsi128_ps(s0));
+
+ height -= 2;
+- src += i_src << 1;
+ dst += i_dst << 1;
+ }
+ }
+@@ -184,9 +181,11 @@ void uavs3d_if_hor_chroma_w16_avx2(const pel *src, int i_src, pel *dst, int i_ds
+ src -= 2;
+
+ while (height) {
+- uavs3d_prefetch(src + i_src*2, _MM_HINT_NTA);
+ S0 = _mm256_loadu_si256((__m256i*)(src));
+ S1 = _mm256_loadu_si256((__m256i*)(src + i_src));
++ src += i_src << 1;
++ uavs3d_prefetch(src, _MM_HINT_NTA);
++ uavs3d_prefetch(src + i_src, _MM_HINT_NTA);
+ S2 = _mm256_permute4x64_epi64(S0, 0x94);
+ S3 = _mm256_permute4x64_epi64(S1, 0x94);
+ R0 = _mm256_shuffle_epi8(S2, mSwitch1);
+@@ -200,6 +199,8 @@ void uavs3d_if_hor_chroma_w16_avx2(const pel *src, int i_src, pel *dst, int i_ds
+ sum0 = _mm256_add_epi16(T0, T1);
+ sum1 = _mm256_add_epi16(T2, T3);
+
++ height -= 2;
++
+ sum0 = _mm256_add_epi16(sum0, mAddOffset);
+ sum1 = _mm256_add_epi16(sum1, mAddOffset);
+ sum0 = _mm256_srai_epi16(sum0, shift);
+@@ -207,8 +208,6 @@ void uavs3d_if_hor_chroma_w16_avx2(const pel *src, int i_src, pel *dst, int i_ds
+ _mm_storeu_si128((__m128i*)(dst), _mm_packus_epi16(_mm256_castsi256_si128(sum0), _mm256_extracti128_si256(sum0, 1)));
+ _mm_storeu_si128((__m128i*)(dst + i_dst), _mm_packus_epi16(_mm256_castsi256_si128(sum1), _mm256_extracti128_si256(sum1, 1)));
+
+- height -= 2;
+- src += i_src << 1;
+ dst += i_dst << 1;
+ }
+ }
+@@ -228,10 +227,10 @@ void uavs3d_if_hor_chroma_w32_avx2(const pel *src, int i_src, pel *dst, int i_ds
+ src -= 2;
+
+ while (height--) {
+- uavs3d_prefetch(src + i_src, _MM_HINT_NTA);
+-
+ S0 = _mm256_loadu_si256((__m256i*)(src));
+ S1 = _mm256_loadu_si256((__m256i*)(src + 16));
++ uavs3d_prefetch(src + i_src, _MM_HINT_NTA);
++
+ S2 = _mm256_permute4x64_epi64(S0, 0x94);
+ S3 = _mm256_permute4x64_epi64(S1, 0x94);
+ R0 = _mm256_shuffle_epi8(S2, mSwitch1);
+@@ -308,59 +307,52 @@ void uavs3d_if_hor_luma_w4_avx2(const pel *src, int i_src, pel *dst, int i_dst,
+ const int offset = 32;
+ const int shift = 6;
+ __m256i mAddOffset = _mm256_set1_epi16(offset);
+- __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12, 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
+- __m256i mSwitch2 = _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14, 2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
+- __m256i T0, T1, T2, T3, S0, S1, S2, S3, sum;
++ __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6);
++ __m256i mSwitch2 = _mm256_setr_epi8(4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10);
++ __m256i T0, T1, T2, T3, S0, S1, sum;
+ __m256i r0, r1, r2, r3;
+ __m128i s0, s1, s2, s3;
+- __m256i mCoefy1_hor = _mm256_set1_epi16(*(s16*)coeff);
+- __m256i mCoefy2_hor = _mm256_set1_epi16(*(s16*)(coeff + 2));
+- __m256i mCoefy3_hor = _mm256_set1_epi16(*(s16*)(coeff + 4));
+- __m256i mCoefy4_hor = _mm256_set1_epi16(*(s16*)(coeff + 6));
++ __m256i mCoefy1_hor = _mm256_set1_epi32(*(s32*)coeff);
++ __m256i mCoefy2_hor = _mm256_set1_epi32(*(s32*)(coeff + 4));
+ src -= 3;
+
+ while (height > 0) {
+- uavs3d_prefetch(src + i_src, _MM_HINT_NTA);
+ s0 = _mm_loadu_si128((__m128i*)(src));
+ s1 = _mm_loadu_si128((__m128i*)(src + i_src));
+ s2 = _mm_loadu_si128((__m128i*)(src + i_src * 2));
+ s3 = _mm_loadu_si128((__m128i*)(src + i_src * 3));
++ src += i_src << 2;
++ uavs3d_prefetch(src, _MM_HINT_NTA);
++ uavs3d_prefetch(src + i_src, _MM_HINT_NTA);
+
+ S0 = _mm256_set_m128i(s2, s0);
+ S1 = _mm256_set_m128i(s3, s1);
+
+- S2 = _mm256_srli_si256(S0, 4);
+- S3 = _mm256_srli_si256(S1, 4);
+-
+- T0 = _mm256_unpacklo_epi64(S0, S1);
+- T1 = _mm256_unpacklo_epi64(S2, S3);
+-
+- r0 = _mm256_shuffle_epi8(T0, mSwitch1);
+- r1 = _mm256_shuffle_epi8(T0, mSwitch2);
+- r2 = _mm256_shuffle_epi8(T1, mSwitch1);
+- r3 = _mm256_shuffle_epi8(T1, mSwitch2);
++ r0 = _mm256_shuffle_epi8(S0, mSwitch1);
++ r1 = _mm256_shuffle_epi8(S0, mSwitch2);
++ r2 = _mm256_shuffle_epi8(S1, mSwitch1);
++ r3 = _mm256_shuffle_epi8(S1, mSwitch2);
+
+ T0 = _mm256_maddubs_epi16(r0, mCoefy1_hor);
+ T1 = _mm256_maddubs_epi16(r1, mCoefy2_hor);
+- T2 = _mm256_maddubs_epi16(r2, mCoefy3_hor);
+- T3 = _mm256_maddubs_epi16(r3, mCoefy4_hor);
++ T2 = _mm256_maddubs_epi16(r2, mCoefy1_hor);
++ T3 = _mm256_maddubs_epi16(r3, mCoefy2_hor);
+
+ T0 = _mm256_add_epi16(T0, T1);
+ T1 = _mm256_add_epi16(T2, T3);
+- sum = _mm256_add_epi16(T0, T1);
++ sum = _mm256_hadd_epi16(T0, T1);
+
+ sum = _mm256_add_epi16(sum, mAddOffset);
+ sum = _mm256_srai_epi16(sum, shift);
+
+ s0 = _mm_packus_epi16(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
+
++ height -= 4;
+ M32(dst) = _mm_extract_epi32(s0, 0);
+ M32(dst + i_dst) = _mm_extract_epi32(s0, 1);
+ M32(dst + i_dst * 2) = _mm_extract_epi32(s0, 2);
+ M32(dst + i_dst * 3) = _mm_extract_epi32(s0, 3);
+
+- height -= 4;
+- src += i_src << 2;
+ dst += i_dst << 2;
+ }
+ }
+@@ -385,9 +377,11 @@ void uavs3d_if_hor_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst,
+ src -= 3;
+
+ while (height) {
+- uavs3d_prefetch(src + i_src, _MM_HINT_NTA);
+ s0 = _mm_loadu_si128((__m128i*)(src));
+ s1 = _mm_loadu_si128((__m128i*)(src + i_src));
++ src += i_src << 1;
++ uavs3d_prefetch(src, _MM_HINT_NTA);
++ uavs3d_prefetch(src + i_src, _MM_HINT_NTA);
+ S = _mm256_set_m128i(s1, s0);
+
+ r0 = _mm256_shuffle_epi8(S, mSwitch1);
+@@ -407,13 +401,11 @@ void uavs3d_if_hor_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst,
+ sum = _mm256_add_epi16(sum, mAddOffset);
+ sum = _mm256_srai_epi16(sum, shift);
+
++ height -= 2;
+ s0 = _mm_packus_epi16(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
+- s1 = _mm_srli_si128(s0, 8);
+ _mm_storel_epi64((__m128i*)(dst), s0);
+- _mm_storel_epi64((__m128i*)(dst + i_dst), s1);
++ _mm_storeh_pi((__m64*)(dst + i_dst), _mm_castsi128_ps(s0));
+
+- height -= 2;
+- src += i_src << 1;
+ dst += i_dst << 1;
+ }
+ }
+@@ -437,11 +429,13 @@ void uavs3d_if_hor_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_dst,
+ src -= 3;
+
+ while (height) {
+- uavs3d_prefetch(src + i_src, _MM_HINT_NTA);
+ S0 = _mm256_loadu_si256((__m256i*)(src));
+ S1 = _mm256_loadu_si256((__m256i*)(src + i_src));
+ S2 = _mm256_permute4x64_epi64(S0, 0x94);
+ S3 = _mm256_permute4x64_epi64(S1, 0x94);
++ src += i_src << 1;
++ uavs3d_prefetch(src, _MM_HINT_NTA);
++ uavs3d_prefetch(src + i_src, _MM_HINT_NTA);
+
+ r0 = _mm256_shuffle_epi8(S2, mSwitch1);
+ r1 = _mm256_shuffle_epi8(S2, mSwitch2);
+@@ -475,11 +469,10 @@ void uavs3d_if_hor_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_dst,
+ sum0 = _mm256_srai_epi16(sum0, shift);
+ sum1 = _mm256_srai_epi16(sum1, shift);
+
++ height -= 2;
+ _mm_storeu_si128((__m128i*)(dst), _mm_packus_epi16(_mm256_castsi256_si128(sum0), _mm256_extracti128_si256(sum0, 1)));
+ _mm_storeu_si128((__m128i*)(dst + i_dst), _mm_packus_epi16(_mm256_castsi256_si128(sum1), _mm256_extracti128_si256(sum1, 1)));
+
+- height -= 2;
+- src += i_src << 1;
+ dst += i_dst << 1;
+ }
+ }
+@@ -503,13 +496,14 @@ void uavs3d_if_hor_luma_w32_avx2(const pel *src, int i_src, pel *dst, int i_dst,
+ src -= 3;
+
+ while (height--) {
+- uavs3d_prefetch(src + i_src, _MM_HINT_NTA);
+-
+ S0 = _mm256_loadu_si256((__m256i*)(src));
+ S1 = _mm256_loadu_si256((__m256i*)(src + 16));
+ S2 = _mm256_permute4x64_epi64(S0, 0x94);
+ S3 = _mm256_permute4x64_epi64(S1, 0x94);
+
++ src += i_src;
++ uavs3d_prefetch(src, _MM_HINT_NTA);
++
+ r0 = _mm256_shuffle_epi8(S2, mSwitch1);
+ r1 = _mm256_shuffle_epi8(S2, mSwitch2);
+ r2 = _mm256_shuffle_epi8(S2, mSwitch3);
+@@ -545,7 +539,6 @@ void uavs3d_if_hor_luma_w32_avx2(const pel *src, int i_src, pel *dst, int i_dst,
+ _mm_storeu_si128((__m128i*)(dst), _mm_packus_epi16(_mm256_castsi256_si128(sum0), _mm256_extracti128_si256(sum0, 1)));
+ _mm_storeu_si128((__m128i*)(dst + 16), _mm_packus_epi16(_mm256_castsi256_si128(sum1), _mm256_extracti128_si256(sum1, 1)));
+
+- src += i_src;
+ dst += i_dst;
+ }
+ }
+@@ -635,15 +628,17 @@ void uavs3d_if_ver_chroma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst
+ src -= i_src;
+
+ while (height) {
+- uavs3d_prefetch(src + 5 * i_src, _MM_HINT_NTA);
+- uavs3d_prefetch(src + 6 * i_src, _MM_HINT_NTA);
+- height -= 2;
+ s0 = _mm_loadl_epi64((__m128i*)(src));
+ s1 = _mm_loadl_epi64((__m128i*)(src + i_src));
+ s2 = _mm_loadl_epi64((__m128i*)(src + i_src2));
+ s3 = _mm_loadl_epi64((__m128i*)(src + i_src3));
+ s4 = _mm_loadl_epi64((__m128i*)(src + i_src4));
+
++ src += 2 * i_src;
++ height -= 2;
++ uavs3d_prefetch(src + i_src3, _MM_HINT_NTA);
++ uavs3d_prefetch(src + i_src4, _MM_HINT_NTA);
++
+ S0 = _mm256_set_m128i(s1, s0);
+ S1 = _mm256_set_m128i(s2, s1);
+ S2 = _mm256_set_m128i(s3, s2);
+@@ -660,12 +655,10 @@ void uavs3d_if_ver_chroma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst
+ mVal = _mm256_add_epi16(mVal, mAddOffset);
+ mVal = _mm256_srai_epi16(mVal, shift);
+ s0 = _mm_packus_epi16(_mm256_castsi256_si128(mVal), _mm256_extracti128_si256(mVal, 1));
+- s1 = _mm_srli_si128(s0, 8);
+
+ _mm_storel_epi64((__m128i*)(dst), s0);
+- _mm_storel_epi64((__m128i*)(dst + i_dst), s1);
++ _mm_storeh_pi((__m64*)(dst + i_dst), _mm_castsi128_ps(s0));
+
+- src += 2 * i_src;
+ dst += 2 * i_dst;
+ }
+ }
+@@ -687,15 +680,17 @@ void uavs3d_if_ver_chroma_w16_avx2(const pel *src, int i_src, pel *dst, int i_ds
+ src -= i_src;
+
+ while (height) {
+- uavs3d_prefetch(src + 5 * i_src, _MM_HINT_NTA);
+- uavs3d_prefetch(src + 6 * i_src, _MM_HINT_NTA);
+- height -= 2;
+ s0 = _mm_loadu_si128((__m128i*)(src));
+ s1 = _mm_loadu_si128((__m128i*)(src + i_src));
+ s2 = _mm_loadu_si128((__m128i*)(src + i_src2));
+ s3 = _mm_loadu_si128((__m128i*)(src + i_src3));
+ s4 = _mm_loadu_si128((__m128i*)(src + i_src4));
+
++ src += 2 * i_src;
++ uavs3d_prefetch(src + i_src3, _MM_HINT_NTA);
++ uavs3d_prefetch(src + i_src4, _MM_HINT_NTA);
++ height -= 2;
++
+ S0 = _mm256_set_m128i(s1, s0);
+ S1 = _mm256_set_m128i(s2, s1);
+ S2 = _mm256_set_m128i(s3, s2);
+@@ -723,7 +718,6 @@ void uavs3d_if_ver_chroma_w16_avx2(const pel *src, int i_src, pel *dst, int i_ds
+ _mm_storeu_si128((__m128i*)dst, _mm256_castsi256_si128(mVal0));
+ _mm_storeu_si128((__m128i*)(dst + i_dst), _mm256_extracti128_si256(mVal0, 1));
+
+- src += 2 * i_src;
+ dst += 2 * i_dst;
+ }
+ }
+@@ -744,15 +738,17 @@ void uavs3d_if_ver_chroma_w32_avx2(const pel *src, int i_src, pel *dst, int i_ds
+ src -= i_src;
+
+ while (height) {
+- uavs3d_prefetch(src + 5 * i_src, _MM_HINT_NTA);
+- uavs3d_prefetch(src + 6 * i_src, _MM_HINT_NTA);
+- height -= 2;
+ S0 = _mm256_loadu_si256((__m256i*)(src));
+ S1 = _mm256_loadu_si256((__m256i*)(src + i_src));
+ S2 = _mm256_loadu_si256((__m256i*)(src + i_src2));
+ S3 = _mm256_loadu_si256((__m256i*)(src + i_src3));
+ S4 = _mm256_loadu_si256((__m256i*)(src + i_src4));
+
++ src += 2 * i_src;
++ height -= 2;
++ uavs3d_prefetch(src + i_src3, _MM_HINT_NTA);
++ uavs3d_prefetch(src + i_src4, _MM_HINT_NTA);
++
+ T0 = _mm256_unpacklo_epi8(S0, S1);
+ T1 = _mm256_unpackhi_epi8(S0, S1);
+ T2 = _mm256_unpacklo_epi8(S2, S3);
+@@ -790,7 +786,6 @@ void uavs3d_if_ver_chroma_w32_avx2(const pel *src, int i_src, pel *dst, int i_ds
+ _mm256_storeu_si256((__m256i*)dst, mVal0);
+ _mm256_storeu_si256((__m256i*)(dst + i_dst), mVal2);
+
+- src += 2 * i_src;
+ dst += 2 * i_dst;
+
+ }
+@@ -811,7 +806,6 @@ void uavs3d_if_ver_chroma_w64_avx2(const pel *src, int i_src, pel *dst, int i_ds
+ src -= i_src;
+
+ while (height--){
+- uavs3d_prefetch(src + 4 * i_src, _MM_HINT_NTA);
+ S0 = _mm256_loadu_si256((__m256i*)(src));
+ S4 = _mm256_loadu_si256((__m256i*)(src + 32));
+ S1 = _mm256_loadu_si256((__m256i*)(src + i_src));
+@@ -821,6 +815,7 @@ void uavs3d_if_ver_chroma_w64_avx2(const pel *src, int i_src, pel *dst, int i_ds
+ S3 = _mm256_loadu_si256((__m256i*)(src + i_src3));
+ S7 = _mm256_loadu_si256((__m256i*)(src + i_src3 + 32));
+
++ src += i_src;
+ T0 = _mm256_unpacklo_epi8(S0, S1);
+ T1 = _mm256_unpacklo_epi8(S2, S3);
+ T2 = _mm256_unpackhi_epi8(S0, S1);
+@@ -830,6 +825,8 @@ void uavs3d_if_ver_chroma_w64_avx2(const pel *src, int i_src, pel *dst, int i_ds
+ T6 = _mm256_unpackhi_epi8(S4, S5);
+ T7 = _mm256_unpackhi_epi8(S6, S7);
+
++ uavs3d_prefetch(src + i_src3, _MM_HINT_NTA);
++
+ T0 = _mm256_maddubs_epi16(T0, coeff0);
+ T1 = _mm256_maddubs_epi16(T1, coeff1);
+ T2 = _mm256_maddubs_epi16(T2, coeff0);
+@@ -858,7 +855,6 @@ void uavs3d_if_ver_chroma_w64_avx2(const pel *src, int i_src, pel *dst, int i_ds
+ _mm256_storeu_si256((__m256i*)(dst), mVal0);
+ _mm256_storeu_si256((__m256i*)(dst + 32), mVal1);
+
+- src += i_src;
+ dst += i_dst;
+ }
+ }
+@@ -878,7 +874,6 @@ void uavs3d_if_ver_chroma_w128_avx2(const pel *src, int i_src, pel *dst, int i_d
+ src -= i_src;
+
+ while (height--) {
+- uavs3d_prefetch(src + 4 * i_src, _MM_HINT_NTA);
+ S0 = _mm256_loadu_si256((__m256i*)(src));
+ S4 = _mm256_loadu_si256((__m256i*)(src + 32));
+ S1 = _mm256_loadu_si256((__m256i*)(src + i_src));
+@@ -934,6 +929,9 @@ void uavs3d_if_ver_chroma_w128_avx2(const pel *src, int i_src, pel *dst, int i_d
+ S3 = _mm256_loadu_si256((__m256i*)(src + i_src3 + 64));
+ S7 = _mm256_loadu_si256((__m256i*)(src + i_src3 + 96));
+
++ src += i_src;
++ uavs3d_prefetch(src + i_src3, _MM_HINT_NTA);
++
+ T0 = _mm256_unpacklo_epi8(S0, S1);
+ T1 = _mm256_unpacklo_epi8(S2, S3);
+ T2 = _mm256_unpackhi_epi8(S0, S1);
+@@ -971,7 +969,6 @@ void uavs3d_if_ver_chroma_w128_avx2(const pel *src, int i_src, pel *dst, int i_d
+ _mm256_storeu_si256((__m256i*)(dst + 64), mVal0);
+ _mm256_storeu_si256((__m256i*)(dst + 96), mVal1);
+
+- src += i_src;
+ dst += i_dst;
+ }
+ }
+@@ -1078,7 +1075,6 @@ void uavs3d_if_ver_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst,
+ while (height) {
+ __m128i S0, S1, S2, S3, S4, S5, S6, S7, S8;
+
+- height -= 2;
+ S0 = _mm_loadl_epi64((__m128i*)(src));
+ S1 = _mm_loadl_epi64((__m128i*)(src + i_src));
+ S2 = _mm_loadl_epi64((__m128i*)(src + i_src2));
+@@ -1098,6 +1094,11 @@ void uavs3d_if_ver_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst,
+ R6 = _mm256_set_m128i(S7, S6);
+ R7 = _mm256_set_m128i(S8, S7);
+
++ src += 2 * i_src;
++ height -= 2;
++ uavs3d_prefetch(src + i_src7, _MM_HINT_NTA);
++ uavs3d_prefetch(src + i_src8, _MM_HINT_NTA);
++
+ T0 = _mm256_unpacklo_epi8(R0, R1);
+ T1 = _mm256_unpacklo_epi8(R2, R3);
+ T2 = _mm256_unpacklo_epi8(R4, R5);
+@@ -1115,11 +1116,9 @@ void uavs3d_if_ver_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst,
+ mVal = _mm256_add_epi16(mVal, mAddOffset);
+ mVal = _mm256_srai_epi16(mVal, shift);
+ S0 = _mm_packus_epi16(_mm256_castsi256_si128(mVal), _mm256_extracti128_si256(mVal, 1));
+- S1 = _mm_srli_si128(S0, 8);
+
+ _mm_storel_epi64((__m128i*)(dst), S0);
+- _mm_storel_epi64((__m128i*)(dst + i_dst), S1);
+- src += 2 * i_src;
++ _mm_storeh_pi((__m64*)(dst + i_dst), _mm_castsi128_ps(S0));
+ dst += 2 * i_dst;
+ }
+ }
+@@ -1147,10 +1146,6 @@ void uavs3d_if_ver_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_dst,
+
+ while(height) {
+ __m128i S0, S1, S2, S3, S4, S5, S6, S7, S8;
+- uavs3d_prefetch(src + 9 * i_src, _MM_HINT_NTA);
+- uavs3d_prefetch(src + 10 * i_src, _MM_HINT_NTA);
+-
+- height -= 2;
+ S0 = _mm_loadu_si128((__m128i*)(src));
+ S1 = _mm_loadu_si128((__m128i*)(src + i_src));
+ S2 = _mm_loadu_si128((__m128i*)(src + i_src2));
+@@ -1170,6 +1165,12 @@ void uavs3d_if_ver_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_dst,
+ R6 = _mm256_set_m128i(S6, S7);
+ R7 = _mm256_set_m128i(S7, S8);
+
++ src += 2 * i_src;
++ height -= 2;
++
++ uavs3d_prefetch(src + i_src7, _MM_HINT_NTA);
++ uavs3d_prefetch(src + i_src8, _MM_HINT_NTA);
++
+ T0 = _mm256_unpacklo_epi8(R0, R1);
+ T1 = _mm256_unpackhi_epi8(R0, R1);
+ T2 = _mm256_unpacklo_epi8(R2, R3);
+@@ -1203,7 +1204,6 @@ void uavs3d_if_ver_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_dst,
+
+ _mm_storeu_si128((__m128i*)dst, _mm256_extractf128_si256(mVal1, 1));
+ _mm_storeu_si128((__m128i*)(dst + i_dst), _mm256_castsi256_si128(mVal1));
+- src += 2 * i_src;
+ dst += 2 * i_dst;
+ }
+ }
+@@ -1230,7 +1230,6 @@ void uavs3d_if_ver_luma_w32_avx2(const pel *src, int i_src, pel *dst, int i_dst,
+ src -= 3 * i_src;
+ while (height--) {
+ __m256i S0, S1, S2, S3, S4, S5, S6, S7;
+- uavs3d_prefetch(src + 8 * i_src, _MM_HINT_NTA);
+ S0 = _mm256_loadu_si256((__m256i*)(src));
+ S1 = _mm256_loadu_si256((__m256i*)(src + i_src));
+ S2 = _mm256_loadu_si256((__m256i*)(src + i_src2));
+@@ -1240,6 +1239,7 @@ void uavs3d_if_ver_luma_w32_avx2(const pel *src, int i_src, pel *dst, int i_dst,
+ S6 = _mm256_loadu_si256((__m256i*)(src + i_src6));
+ S7 = _mm256_loadu_si256((__m256i*)(src + i_src7));
+
++ src += i_src;
+ T0 = _mm256_unpacklo_epi8(S0, S1);
+ T1 = _mm256_unpacklo_epi8(S2, S3);
+ T2 = _mm256_unpacklo_epi8(S4, S5);
+@@ -1249,6 +1249,8 @@ void uavs3d_if_ver_luma_w32_avx2(const pel *src, int i_src, pel *dst, int i_dst,
+ T6 = _mm256_unpackhi_epi8(S4, S5);
+ T7 = _mm256_unpackhi_epi8(S6, S7);
+
++ uavs3d_prefetch(src + i_src7, _MM_HINT_NTA);
++
+ T0 = _mm256_maddubs_epi16(T0, coeff0);
+ T1 = _mm256_maddubs_epi16(T1, coeff1);
+ T2 = _mm256_maddubs_epi16(T2, coeff2);
+@@ -1273,7 +1275,6 @@ void uavs3d_if_ver_luma_w32_avx2(const pel *src, int i_src, pel *dst, int i_dst,
+
+ _mm256_storeu_si256((__m256i*)(dst), mVal1);
+
+- src += i_src;
+ dst += i_dst;
+ }
+ }
+@@ -1295,12 +1296,11 @@ void uavs3d_if_ver_luma_w64_avx2(const pel *src, int i_src, pel *dst, int i_dst,
+ __m256i coeff3 = _mm256_set1_epi16(*(s16*)(coeff + 6));
+ __m256i T0, T1, T2, T3, T4, T5, T6, T7, mVal1, mVal2;
+
+- src -= 3 * i_src;
++ src -= i_src3;
+
+ while (height--) {
+ const pel *p = src + 32;
+ __m256i S0, S1, S2, S3, S4, S5, S6, S7;
+- uavs3d_prefetch(src + 8 * i_src, _MM_HINT_NTA);
+ S0 = _mm256_loadu_si256((__m256i*)(src));
+ S1 = _mm256_loadu_si256((__m256i*)(src + i_src));
+ S2 = _mm256_loadu_si256((__m256i*)(src + i_src2));
+@@ -1352,6 +1352,7 @@ void uavs3d_if_ver_luma_w64_avx2(const pel *src, int i_src, pel *dst, int i_dst,
+ S6 = _mm256_loadu_si256((__m256i*)(p + i_src6));
+ S7 = _mm256_loadu_si256((__m256i*)(p + i_src7));
+
++ src += i_src;
+ T0 = _mm256_unpacklo_epi8(S0, S1);
+ T1 = _mm256_unpacklo_epi8(S2, S3);
+ T2 = _mm256_unpacklo_epi8(S4, S5);
+@@ -1361,6 +1362,8 @@ void uavs3d_if_ver_luma_w64_avx2(const pel *src, int i_src, pel *dst, int i_dst,
+ T6 = _mm256_unpackhi_epi8(S4, S5);
+ T7 = _mm256_unpackhi_epi8(S6, S7);
+
++ uavs3d_prefetch(src + i_src7, _MM_HINT_NTA);
++
+ T0 = _mm256_maddubs_epi16(T0, coeff0);
+ T1 = _mm256_maddubs_epi16(T1, coeff1);
+ T2 = _mm256_maddubs_epi16(T2, coeff2);
+@@ -1385,7 +1388,6 @@ void uavs3d_if_ver_luma_w64_avx2(const pel *src, int i_src, pel *dst, int i_dst,
+
+ _mm256_storeu_si256((__m256i*)(dst + 32), mVal1);
+
+- src += i_src;
+ dst += i_dst;
+ }
+ }
+@@ -1412,7 +1414,6 @@ void uavs3d_if_ver_luma_w128_avx2(const pel *src, int i_src, pel *dst, int i_dst
+ while (height--) {
+ const pel *p = src + 32;
+ __m256i S0, S1, S2, S3, S4, S5, S6, S7;
+- uavs3d_prefetch(src + 8 * i_src, _MM_HINT_NTA);
+ S0 = _mm256_loadu_si256((__m256i*)(src));
+ S1 = _mm256_loadu_si256((__m256i*)(src + i_src));
+ S2 = _mm256_loadu_si256((__m256i*)(src + i_src2));
+@@ -1552,6 +1553,8 @@ void uavs3d_if_ver_luma_w128_avx2(const pel *src, int i_src, pel *dst, int i_dst
+ S6 = _mm256_loadu_si256((__m256i*)(p + i_src6));
+ S7 = _mm256_loadu_si256((__m256i*)(p + i_src7));
+
++ src += i_src;
++ uavs3d_prefetch(src + i_src7, _MM_HINT_NTA);
+ T0 = _mm256_unpacklo_epi8(S0, S1);
+ T1 = _mm256_unpacklo_epi8(S2, S3);
+ T2 = _mm256_unpacklo_epi8(S4, S5);
+@@ -1585,7 +1588,6 @@ void uavs3d_if_ver_luma_w128_avx2(const pel *src, int i_src, pel *dst, int i_dst
+
+ _mm256_storeu_si256((__m256i*)(dst + 96), mVal1);
+
+- src += i_src;
+ dst += i_dst;
+ }
+ }
+@@ -1668,6 +1670,8 @@ void uavs3d_if_hor_ver_chroma_w8_avx2(const pel *src, int i_src, pel *dst, int i
+ S2 = _mm256_permute2x128_si256(mVal[1], mVal[2], 0x21);
+ S3 = mVal[2];
+
++ uavs3d_prefetch(src + i_src3, _MM_HINT_NTA);
++
+ T0 = _mm256_unpacklo_epi16(S0, S1);
+ T1 = _mm256_unpacklo_epi16(S2, S3);
+ T2 = _mm256_unpackhi_epi16(S0, S1);
+@@ -1709,17 +1713,15 @@ void uavs3d_if_hor_ver_chroma_w8_avx2(const pel *src, int i_src, pel *dst, int i
+ T0 = _mm256_srai_epi32(T0, shift);
+ T2 = _mm256_srai_epi32(T2, shift);
+
+- s0 = _mm_packus_epi16(_mm256_castsi256_si128(R0), _mm256_extracti128_si256(R0, 1));
+- s1 = _mm_srli_si128(s0, 8);
++ s2 = _mm_packus_epi16(_mm256_castsi256_si128(R0), _mm256_extracti128_si256(R0, 1));
+
+ T0 = _mm256_packs_epi32(T0, T2);
+- s2 = _mm_packus_epi16(_mm256_castsi256_si128(T0), _mm256_extracti128_si256(T0, 1));
+- s3 = _mm_srli_si128(s2, 8);
++ s3 = _mm_packus_epi16(_mm256_castsi256_si128(T0), _mm256_extracti128_si256(T0, 1));
+
+- _mm_storel_epi64((__m128i*)(dst), s0);
+- _mm_storel_epi64((__m128i*)(dst + i_dst), s1);
+- _mm_storel_epi64((__m128i*)(dst + i_dst*2), s2);
+- _mm_storel_epi64((__m128i*)(dst + i_dst*3), s3);
++ _mm_storel_epi64((__m128i*)(dst), s2);
++ _mm_storeh_pi((__m64*)(dst + i_dst), _mm_castsi128_ps(s2));
++ _mm_storel_epi64((__m128i*)(dst + i_dst*2), s3);
++ _mm_storeh_pi((__m64*)(dst + i_dst*3), _mm_castsi128_ps(s3));
+
+ dst += i_dst << 2;
+ height -= 4;
+@@ -1756,17 +1758,17 @@ void uavs3d_if_hor_ver_chroma_w16_avx2(const pel *src, int i_src, pel *dst, int
+ row = height + 3;
+
+ while (row--) {
+- uavs3d_prefetch(src + i_src*2, _MM_HINT_NTA);
+ S0 = _mm256_loadu_si256((__m256i*)(src));
++ src += i_src;
+ S1 = _mm256_permute4x64_epi64(S0, 0x94);
++ uavs3d_prefetch(src, _MM_HINT_NTA);
+ R0 = _mm256_shuffle_epi8(S1, mSwitch1);
+ R1 = _mm256_shuffle_epi8(S1, mSwitch2);
+ T0 = _mm256_maddubs_epi16(R0, mCoefy1_hor);
+ T1 = _mm256_maddubs_epi16(R1, mCoefy2_hor);
+ sum = _mm256_add_epi16(T0, T1);
+
+- _mm256_storeu_si256((__m256i*)(tmp), sum);
+- src += i_src;
++ _mm256_store_si256((__m256i*)(tmp), sum);
+ tmp += i_tmp;
+ }
+
+@@ -1878,8 +1880,8 @@ void uavs3d_if_hor_ver_chroma_w32x_avx2(const pel *src, int i_src, pel *dst, int
+ sum0 = _mm256_add_epi16(T0, T1);
+ sum1 = _mm256_add_epi16(T2, T3);
+
+- _mm256_storeu_si256((__m256i*)(tmp + col), sum0);
+- _mm256_storeu_si256((__m256i*)(tmp + col + 16), sum1);
++ _mm256_store_si256((__m256i*)(tmp + col), sum0);
++ _mm256_store_si256((__m256i*)(tmp + col + 16), sum1);
+ }
+ src += i_src;
+ tmp += i_tmp;
+@@ -2055,8 +2057,8 @@ void uavs3d_if_hor_ver_luma_w4_avx2(const pel *src, int i_src, pel *dst, int i_d
+
+ s0 = _mm_loadu_si128((__m128i*)(src));
+ s1 = _mm_loadu_si128((__m128i*)(src + i_src));
+- s2 = _mm_loadu_si128((__m128i*)(src + i_src * 2));
+- s3 = _mm_loadu_si128((__m128i*)(src + i_src * 3));
++ s2 = _mm_loadu_si128((__m128i*)(src + i_src2));
++ s3 = _mm_loadu_si128((__m128i*)(src + i_src3));
+
+ S0 = _mm256_set_m128i(s2, s0);
+ S1 = _mm256_set_m128i(s3, s1);
+@@ -2090,8 +2092,8 @@ void uavs3d_if_hor_ver_luma_w4_avx2(const pel *src, int i_src, pel *dst, int i_d
+ // hor
+ s0 = _mm_loadu_si128((__m128i*)(src));
+ s1 = _mm_loadu_si128((__m128i*)(src + i_src));
+- s2 = _mm_loadu_si128((__m128i*)(src + i_src * 2));
+- s3 = _mm_loadu_si128((__m128i*)(src + i_src * 3));
++ s2 = _mm_loadu_si128((__m128i*)(src + i_src2));
++ s3 = _mm_loadu_si128((__m128i*)(src + i_src3));
+
+ S0 = _mm256_set_m128i(s2, s0);
+ S1 = _mm256_set_m128i(s3, s1);
+@@ -2173,19 +2175,7 @@ void uavs3d_if_hor_ver_luma_w4_avx2(const pel *src, int i_src, pel *dst, int i_d
+
+ void uavs3d_if_hor_ver_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst, int width, int height, const s8 *coef_x, const s8 *coef_y, int max_val)
+ {
+- const int i_tmp = 8;
+- const int i_tmp2 = 16;
+- const int i_tmp3 = 24;
+- const int i_tmp4 = 32;
+- const int i_tmp5 = 40;
+- const int i_tmp6 = 48;
+- const int i_tmp7 = 56;;
+- const int i_tmp8 = 64;
+- const int i_tmp9 = 72;;
+- const int i_tmp10 = 80;
+ const int i_src2 = i_src << 1;
+- int row;
+- int shift = 12;
+
+ __m256i T0, T1, T2, T3, T4, T5, T6, T7, T8, T9;
+ __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9;
+@@ -2201,12 +2191,16 @@ void uavs3d_if_hor_ver_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_d
+
+ //HOR
+ {
++ int row;
+ src = src - 3 * i_src - 3;
+
+ // first row
+ {
+ __m128i mSrc0 = _mm_loadu_si128((__m128i*)(src));
+ T0 = _mm256_set_m128i(mSrc0, mSrc0);
++ src += i_src;
++
++ uavs3d_prefetch(src, _MM_HINT_NTA);
+
+ r0 = _mm256_shuffle_epi8(T0, mSwitch1);
+ r1 = _mm256_shuffle_epi8(T0, mSwitch2);
+@@ -2224,13 +2218,16 @@ void uavs3d_if_hor_ver_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_d
+
+ mVal[0] = _mm256_permute4x64_epi64(mVal[0], 0x44);
+
+- src += i_src;
+ }
+
+ for (row = 1; row < 4; row++) {
+- __m128i mSrc0 = _mm_loadu_si128((__m128i*)(src));
++ __m128i mSrc0 = _mm_loadu_si128((__m128i*)(src));
+ __m128i mSrc1 = _mm_loadu_si128((__m128i*)(src + i_src));
+ T0 = _mm256_set_m128i(mSrc1, mSrc0);
++ src += i_src2;
++
++ uavs3d_prefetch(src, _MM_HINT_NTA);
++ uavs3d_prefetch(src + i_src, _MM_HINT_NTA);
+
+ r0 = _mm256_shuffle_epi8(T0, mSwitch1);
+ r1 = _mm256_shuffle_epi8(T0, mSwitch2);
+@@ -2245,8 +2242,6 @@ void uavs3d_if_hor_ver_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_d
+ T0 = _mm256_add_epi16(T0, T1);
+ T1 = _mm256_add_epi16(T2, T3);
+ mVal[row] = _mm256_add_epi16(T0, T1);
+-
+- src += i_src2;
+ }
+ }
+
+@@ -2260,35 +2255,44 @@ void uavs3d_if_hor_ver_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_d
+ __m256i mCoefy2 = _mm256_cvtepi8_epi16(mCoefy22);
+ __m256i mCoefy3 = _mm256_cvtepi8_epi16(mCoefy33);
+ __m256i mCoefy4 = _mm256_cvtepi8_epi16(mCoefy44);
++ const int shift = 12;
+
+ while (height > 0) {
+ __m128i s0, s1;
+ //hor
+- s0 = _mm_loadu_si128((__m128i*)(src));
++ s0 = _mm_loadu_si128((__m128i*)(src));
+ s1 = _mm_loadu_si128((__m128i*)(src + i_src));
+ T0 = _mm256_set_m128i(s1, s0);
+
++ src += i_src2;
++
++ uavs3d_prefetch(src, _MM_HINT_NTA);
++ uavs3d_prefetch(src + i_src, _MM_HINT_NTA);
++
+ r0 = _mm256_shuffle_epi8(T0, mSwitch1);
+ r1 = _mm256_shuffle_epi8(T0, mSwitch2);
+ r2 = _mm256_shuffle_epi8(T0, mSwitch3);
+ r3 = _mm256_shuffle_epi8(T0, mSwitch4);
+
+- src += i_src2;
+-
+ T0 = _mm256_maddubs_epi16(r0, mCoefy1_hor);
+ T1 = _mm256_maddubs_epi16(r1, mCoefy2_hor);
+ T2 = _mm256_maddubs_epi16(r2, mCoefy3_hor);
+ T3 = _mm256_maddubs_epi16(r3, mCoefy4_hor);
+
+- s0 = _mm_loadu_si128((__m128i*)(src));
++ s0 = _mm_loadu_si128((__m128i*)(src));
+ s1 = _mm_loadu_si128((__m128i*)(src + i_src));
+
+ T0 = _mm256_add_epi16(T0, T1);
+ T1 = _mm256_add_epi16(T2, T3);
+ mVal[4] = _mm256_add_epi16(T0, T1);
+
++ src += i_src2;
++
+ T0 = _mm256_set_m128i(s1, s0);
+
++ uavs3d_prefetch(src, _MM_HINT_NTA);
++ uavs3d_prefetch(src + i_src, _MM_HINT_NTA);
++
+ r0 = _mm256_shuffle_epi8(T0, mSwitch1);
+ r1 = _mm256_shuffle_epi8(T0, mSwitch2);
+ r2 = _mm256_shuffle_epi8(T0, mSwitch3);
+@@ -2303,8 +2307,6 @@ void uavs3d_if_hor_ver_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_d
+ T1 = _mm256_add_epi16(T2, T3);
+ mVal[5] = _mm256_add_epi16(T0, T1);
+
+- src += i_src2;
+-
+ T0 = _mm256_permute2x128_si256(mVal[0], mVal[1], 0x21);
+ T1 = mVal[1];
+ T2 = _mm256_permute2x128_si256(mVal[1], mVal[2], 0x21);
+@@ -2352,10 +2354,9 @@ void uavs3d_if_hor_ver_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_d
+
+ T0 = _mm256_packs_epi32(T0, T4);
+ s0 = _mm_packus_epi16(_mm256_castsi256_si128(T0), _mm256_extracti128_si256(T0, 1));
+- s1 = _mm_srli_si128(s0, 8);
+
+ _mm_storel_epi64((__m128i*)(dst), s0);
+- _mm_storel_epi64((__m128i*)(dst + i_dst), s1);
++ _mm_storeh_pi((__m64*)(dst + i_dst), _mm_castsi128_ps(s0));
+
+ r4 = _mm256_unpacklo_epi16(T8, T9);
+ r9 = _mm256_unpackhi_epi16(T8, T9);
+@@ -2382,13 +2383,12 @@ void uavs3d_if_hor_ver_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_d
+
+ T0 = _mm256_packs_epi32(T0, T4);
+ s0 = _mm_packus_epi16(_mm256_castsi256_si128(T0), _mm256_extracti128_si256(T0, 1));
+- s1 = _mm_srli_si128(s0, 8);
+
++ height -= 4;
+ _mm_storel_epi64((__m128i*)(dst + i_dst * 2), s0);
+- _mm_storel_epi64((__m128i*)(dst + i_dst * 3), s1);
++ _mm_storeh_pi((__m64*)(dst + i_dst * 3), _mm_castsi128_ps(s0));
+
+ dst += i_dst << 2;
+- height -= 4;
+ }
+ }
+ }
+@@ -2397,7 +2397,6 @@ void uavs3d_if_hor_ver_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_
+ {
+ ALIGNED_32(s16 tmp_res[(128 + 7) * 16]);
+ s16 *tmp = tmp_res;
+- int row;
+ __m256i mVal1, mVal2, mVal;
+ __m256i T0, T1, T2, T3, T4, T5, T6, T7, T8, T9;
+ __m256i S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10;
+@@ -2406,6 +2405,7 @@ void uavs3d_if_hor_ver_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_
+
+ //HOR
+ {
++ int row;
+ __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+ __m256i mSwitch2 = _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+ __m256i mSwitch3 = _mm256_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12);
+@@ -2421,8 +2421,9 @@ void uavs3d_if_hor_ver_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_
+ row = height + 7;
+ while (row--) {
+ S = _mm256_loadu_si256((__m256i*)(src));
+- uavs3d_prefetch(src + i_src, _MM_HINT_NTA);
++ src += i_src;
+ S0 = _mm256_permute4x64_epi64(S, 0x94);
++ uavs3d_prefetch(src, _MM_HINT_NTA);
+
+ r0 = _mm256_shuffle_epi8(S0, mSwitch1);
+ r1 = _mm256_shuffle_epi8(S0, mSwitch2);
+@@ -2438,9 +2439,8 @@ void uavs3d_if_hor_ver_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_
+ T1 = _mm256_add_epi16(T2, T3);
+ sum = _mm256_add_epi16(T0, T1);
+
+- _mm256_storeu_si256((__m256i*)(tmp), sum);
++ _mm256_store_si256((__m256i*)(tmp), sum);
+
+- src += i_src;
+ tmp += 16;
+ }
+ }
+@@ -2603,9 +2603,9 @@ void uavs3d_if_hor_ver_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_
+ mVal = _mm256_packs_epi32(mVal1, mVal2);
+ _mm_storeu_si128((__m128i*)(dst + 3 * i_dst), _mm_packus_epi16(_mm256_castsi256_si128(mVal), _mm256_extracti128_si256(mVal, 1)));
+
++ height -= 4;
+ tmp += 4 * i_tmp;
+ dst += 4 * i_dst;
+- height -= 4;
+ }
+ }
+ }
+@@ -2614,10 +2614,10 @@ void uavs3d_if_hor_ver_luma_w32_avx2(const pel *src, int i_src, pel *dst, int i_
+ {
+ ALIGNED_32(s16 tmp_res[(128 + 7) * 32]);
+ s16 *tmp = tmp_res;
+- int row, col;
+ const int i_tmp = 32;
+ //HOR
+ {
++ int row;
+ __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+ __m256i mSwitch2 = _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+ __m256i mSwitch3 = _mm256_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12);
+@@ -2629,26 +2629,25 @@ void uavs3d_if_hor_ver_luma_w32_avx2(const pel *src, int i_src, pel *dst, int i_
+ __m256i mCoefy4_hor = _mm256_set1_epi16(*(s16*)(coef_x + 6));
+
+ __m256i T0, T1, T2, T3, T4, T5, T6, T7;
+- __m256i S0, S1, S2, S3;
++ __m256i S0, S1;
+
+ src = src - 3 * i_src - 3;
+
+ row = height + 7;
+ while (row--) {
+- uavs3d_prefetch(src + i_src, _MM_HINT_NTA);
+ S0 = _mm256_loadu_si256((__m256i*)(src));
+ S1 = _mm256_loadu_si256((__m256i*)(src + 8));
+- S2 = _mm256_insertf128_si256(S0, _mm256_castsi256_si128(S1), 0x1);
+- S3 = _mm256_insertf128_si256(S1, _mm256_extracti128_si256(S0, 1), 0x0);
+-
+- T0 = _mm256_shuffle_epi8(S2, mSwitch1);
+- T1 = _mm256_shuffle_epi8(S2, mSwitch2);
+- T2 = _mm256_shuffle_epi8(S2, mSwitch3);
+- T3 = _mm256_shuffle_epi8(S2, mSwitch4);
+- T4 = _mm256_shuffle_epi8(S3, mSwitch1);
+- T5 = _mm256_shuffle_epi8(S3, mSwitch2);
+- T6 = _mm256_shuffle_epi8(S3, mSwitch3);
+- T7 = _mm256_shuffle_epi8(S3, mSwitch4);
++ src += i_src;
++ uavs3d_prefetch(src, _MM_HINT_NTA);
++
++ T0 = _mm256_shuffle_epi8(S0, mSwitch1);
++ T1 = _mm256_shuffle_epi8(S0, mSwitch2);
++ T2 = _mm256_shuffle_epi8(S0, mSwitch3);
++ T3 = _mm256_shuffle_epi8(S0, mSwitch4);
++ T4 = _mm256_shuffle_epi8(S1, mSwitch1);
++ T5 = _mm256_shuffle_epi8(S1, mSwitch2);
++ T6 = _mm256_shuffle_epi8(S1, mSwitch3);
++ T7 = _mm256_shuffle_epi8(S1, mSwitch4);
+
+ T0 = _mm256_maddubs_epi16(T0, mCoefy1_hor);
+ T1 = _mm256_maddubs_epi16(T1, mCoefy2_hor);
+@@ -2666,10 +2665,11 @@ void uavs3d_if_hor_ver_luma_w32_avx2(const pel *src, int i_src, pel *dst, int i_
+ T0 = _mm256_add_epi16(T0, T2);
+ T4 = _mm256_add_epi16(T4, T6);
+
+- _mm256_storeu_si256((__m256i*)(tmp), T0);
+- _mm256_storeu_si256((__m256i*)(tmp + 16), T4);
++ T1 = _mm256_permute2x128_si256(T0, T4, 0x20);
++ T3 = _mm256_permute2x128_si256(T0, T4, 0x31);
++ _mm256_store_si256((__m256i*)(tmp), T1);
++ _mm256_store_si256((__m256i*)(tmp + 16), T3);
+
+- src += i_src;
+ tmp += i_tmp;
+ }
+ }
+@@ -2685,8 +2685,8 @@ void uavs3d_if_hor_ver_luma_w32_avx2(const pel *src, int i_src, pel *dst, int i_
+ const int i_tmp8 = 32 * 8;
+ const int i_tmp9 = 32 * 9;
+ const int i_tmp10 = 32 * 10;
+-
+- int shift = 12;
++ int col;
++ const int shift = 12;
+ __m256i mAddOffset = _mm256_set1_epi32(1 << 11);
+ __m128i mCoefy11 = _mm_set1_epi16(*(s16*)coef_y);
+ __m128i mCoefy22 = _mm_set1_epi16(*(s16*)(coef_y + 2));
+@@ -2850,7 +2850,6 @@ void uavs3d_if_hor_ver_luma_w32x_avx2(const pel *src, int i_src, pel *dst, int i
+ {
+ ALIGNED_32(s16 tmp_res[(128 + 7) * 128]);
+ s16 *tmp = tmp_res;
+- int row, col;
+ const int i_tmp = width;
+ __m256i mVal1, mVal2, mVal;
+ __m256i T0, T1, T2, T3, T4, T5, T6, T7, T8, T9;
+@@ -2859,6 +2858,7 @@ void uavs3d_if_hor_ver_luma_w32x_avx2(const pel *src, int i_src, pel *dst, int i
+
+ //HOR
+ {
++ int row, col;
+ __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+ __m256i mSwitch2 = _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+ __m256i mSwitch3 = _mm256_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12);
+@@ -2878,17 +2878,15 @@ void uavs3d_if_hor_ver_luma_w32x_avx2(const pel *src, int i_src, pel *dst, int i
+ {
+ S0 = _mm256_loadu_si256((__m256i*)(src + col));
+ S1 = _mm256_loadu_si256((__m256i*)(src + col + 8));
+- S2 = _mm256_insertf128_si256(S0, _mm256_castsi256_si128(S1), 0x1);
+- S3 = _mm256_insertf128_si256(S1, _mm256_extracti128_si256(S0, 1), 0x0);
+-
+- T0 = _mm256_shuffle_epi8(S2, mSwitch1);
+- T1 = _mm256_shuffle_epi8(S2, mSwitch2);
+- T2 = _mm256_shuffle_epi8(S2, mSwitch3);
+- T3 = _mm256_shuffle_epi8(S2, mSwitch4);
+- T4 = _mm256_shuffle_epi8(S3, mSwitch1);
+- T5 = _mm256_shuffle_epi8(S3, mSwitch2);
+- T6 = _mm256_shuffle_epi8(S3, mSwitch3);
+- T7 = _mm256_shuffle_epi8(S3, mSwitch4);
++
++ T0 = _mm256_shuffle_epi8(S0, mSwitch1);
++ T1 = _mm256_shuffle_epi8(S0, mSwitch2);
++ T2 = _mm256_shuffle_epi8(S0, mSwitch3);
++ T3 = _mm256_shuffle_epi8(S0, mSwitch4);
++ T4 = _mm256_shuffle_epi8(S1, mSwitch1);
++ T5 = _mm256_shuffle_epi8(S1, mSwitch2);
++ T6 = _mm256_shuffle_epi8(S1, mSwitch3);
++ T7 = _mm256_shuffle_epi8(S1, mSwitch4);
+
+ T0 = _mm256_maddubs_epi16(T0, mCoefy1_hor);
+ T1 = _mm256_maddubs_epi16(T1, mCoefy2_hor);
+@@ -2906,8 +2904,10 @@ void uavs3d_if_hor_ver_luma_w32x_avx2(const pel *src, int i_src, pel *dst, int i
+ T0 = _mm256_add_epi16(T0, T2);
+ T4 = _mm256_add_epi16(T4, T6);
+
+- _mm256_storeu_si256((__m256i*)(tmp + col), T0);
+- _mm256_storeu_si256((__m256i*)(tmp + col + 16), T4);
++ T1 = _mm256_permute2x128_si256(T0, T4, 0x20);
++ T3 = _mm256_permute2x128_si256(T0, T4, 0x31);
++ _mm256_store_si256((__m256i*)(tmp + col), T1);
++ _mm256_store_si256((__m256i*)(tmp + col + 16), T3);
+ }
+ src += i_src;
+ tmp += i_tmp;
+@@ -2927,6 +2927,7 @@ void uavs3d_if_hor_ver_luma_w32x_avx2(const pel *src, int i_src, pel *dst, int i
+ const int i_tmp10 = i_tmp5 << 1;
+
+ int shift = 12;
++ int col;
+ __m256i mAddOffset = _mm256_set1_epi32(1 << 11);
+ __m128i mCoefy11 = _mm_set1_epi16(*(s16*)coef_y);
+ __m128i mCoefy22 = _mm_set1_epi16(*(s16*)(coef_y + 2));
+@@ -3186,45 +3187,121 @@ void uavs3d_if_cpy_w128_avx2(const pel *src, int i_src, pel *dst, int i_dst, int
+ }
+ }
+
++void uavs3d_if_hor_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst, int width, int height, const s8 *coeff, int max_val)
++{ // Horizontal 8-tap filter producing 128 bits of output per row, two rows per pass; width is not read. Assumes pel is 16 bits wide - TODO confirm.
++ __m256i max_pel = _mm256_set1_epi16((pel)max_val); // upper clamp applied to every output
++ __m256i T0, T1, T2, T3, T4, T5;
++ __m256i M0, M1, M2, M3, M4, M5, M6, M7;
++ __m256i S0, S1, S2;
++ __m256i offset = _mm256_set1_epi32(32); // rounding term for the >> 6 below
++ __m256i mShuffle0 = _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9); // per lane: 16-bit pairs (0,1)(1,2)(2,3)(3,4)
++ __m256i mShuffle1 = _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13); // per lane: 16-bit pairs (2,3)(3,4)(4,5)(5,6)
++ __m256i mCoef0 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coeff)[0])); // taps 0,1 sign-extended to 16 bits and repeated
++ __m256i mCoef1 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coeff)[1])); // taps 2,3
++ __m256i mCoef2 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coeff)[2])); // taps 4,5
++ __m256i mCoef3 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coeff)[3])); // taps 6,7
++ __m128i s0, s1;
++
++ src -= 3; // start three samples to the left of the output position
++
++ while (height) { // NOTE(review): height drops by 2 per pass, so an odd height never reaches 0 - presumably callers pass even heights, confirm
++ T0 = _mm256_loadu_si256((__m256i*)(src)); // first row, 256 bits from src
++ s0 = _mm_loadu_si128((__m128i*)(src + 4)); // first row, 128 bits from src + 4
++ T1 = _mm256_loadu_si256((__m256i*)(src + i_src)); // second row, 256 bits
++ s1 = _mm_loadu_si128((__m128i*)(src + i_src + 4)); // second row, 128 bits from src + i_src + 4
++ height -= 2;
++ src += i_src << 1; // step two rows so the prefetches below target the next pair
++ uavs3d_prefetch(src, _MM_HINT_NTA);
++ uavs3d_prefetch(src + i_src, _MM_HINT_NTA);
++
++ S0 = _mm256_permute2x128_si256(T0, T1, 0x20); // low halves: first row | second row
++ S2 = _mm256_permute2x128_si256(T0, T1, 0x31); // high halves: first row | second row
++ S1 = _mm256_set_m128i(s1, s0); // the +4 loads: first row in the low lane, second row in the high lane
++
++ T0 = _mm256_shuffle_epi8(S0, mShuffle0);
++ T1 = _mm256_shuffle_epi8(S0, mShuffle1);
++ T2 = _mm256_shuffle_epi8(S1, mShuffle0);
++ T3 = _mm256_shuffle_epi8(S1, mShuffle1);
++ T4 = _mm256_shuffle_epi8(S2, mShuffle0);
++ T5 = _mm256_shuffle_epi8(S2, mShuffle1);
++
++ M0 = _mm256_madd_epi16(T0, mCoef0); // M0..M3: the four tap pairs for the outputs anchored at S0
++ M1 = _mm256_madd_epi16(T1, mCoef1);
++ M2 = _mm256_madd_epi16(T2, mCoef2);
++ M3 = _mm256_madd_epi16(T3, mCoef3);
++ M4 = _mm256_madd_epi16(T2, mCoef0); // M4..M7: the same tap pairs for the outputs anchored at S1
++ M5 = _mm256_madd_epi16(T3, mCoef1);
++ M6 = _mm256_madd_epi16(T4, mCoef2);
++ M7 = _mm256_madd_epi16(T5, mCoef3);
++
++ M0 = _mm256_add_epi32(M0, M1);
++ M1 = _mm256_add_epi32(M2, M3);
++ M2 = _mm256_add_epi32(M4, M5);
++ M3 = _mm256_add_epi32(M6, M7);
++
++ M0 = _mm256_add_epi32(M0, M1); // complete sums for the S0-anchored outputs
++ M1 = _mm256_add_epi32(M2, M3); // complete sums for the S1-anchored outputs
++
++ M2 = _mm256_add_epi32(M0, offset);
++ M3 = _mm256_add_epi32(M1, offset);
++ M2 = _mm256_srai_epi32(M2, 6); // (sum + 32) >> 6
++ M3 = _mm256_srai_epi32(M3, 6);
++ M2 = _mm256_packus_epi32(M2, M3); // unsigned-saturating 32 -> 16 bit pack; each lane keeps its own row
++ M2 = _mm256_min_epu16(M2, max_pel); // clamp to max_val
++
++ _mm_storeu_si128((__m128i*)(dst), _mm256_castsi256_si128(M2)); // low lane -> first row
++ _mm_storeu_si128((__m128i*)(dst + i_dst), _mm256_extracti128_si256(M2, 1)); // high lane -> second row
++
++ dst += i_dst << 1;
++ }
++}
++
+ void uavs3d_if_hor_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_dst, int width, int height, const s8 *coeff, int max_val)
+ {
+ __m256i max_pel = _mm256_set1_epi16((pel)max_val);
+- __m256i T0, T1, T2, T3, T4, T5, T6, T7;
++ __m256i T0, T1, T2, T3, T4, T5;
+ __m256i M0, M1, M2, M3, M4, M5, M6, M7;
++ __m256i S0, S1, S2;
+ __m256i offset = _mm256_set1_epi32(32);
+- s32* coef = (s32*)coeff;
+- __m128i mCoef0 = _mm_setr_epi32(coef[0], coef[1], coef[0], coef[1]);
+- __m256i mCoef = _mm256_cvtepi8_epi16(mCoef0);
++ __m256i mShuffle0 = _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
++ __m256i mShuffle1 = _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);
++ __m256i mCoef0 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coeff)[0]));
++ __m256i mCoef1 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coeff)[1]));
++ __m256i mCoef2 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coeff)[2]));
++ __m256i mCoef3 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coeff)[3]));
+
+ src -= 3;
+
+ while (height--) {
+- uavs3d_prefetch(src + i_src, _MM_HINT_NTA);
+- T0 = _mm256_loadu_si256((__m256i*)(src + 0));
+- T1 = _mm256_loadu_si256((__m256i*)(src + 1));
+- T2 = _mm256_loadu_si256((__m256i*)(src + 2));
+- T3 = _mm256_loadu_si256((__m256i*)(src + 3));
+- T4 = _mm256_loadu_si256((__m256i*)(src + 4));
+- T5 = _mm256_loadu_si256((__m256i*)(src + 5));
+- T6 = _mm256_loadu_si256((__m256i*)(src + 6));
+- T7 = _mm256_loadu_si256((__m256i*)(src + 7));
+-
+- M0 = _mm256_madd_epi16(T0, mCoef);
+- M1 = _mm256_madd_epi16(T1, mCoef);
+- M2 = _mm256_madd_epi16(T2, mCoef);
+- M3 = _mm256_madd_epi16(T3, mCoef);
+- M4 = _mm256_madd_epi16(T4, mCoef);
+- M5 = _mm256_madd_epi16(T5, mCoef);
+- M6 = _mm256_madd_epi16(T6, mCoef);
+- M7 = _mm256_madd_epi16(T7, mCoef);
+-
+- M0 = _mm256_hadd_epi32(M0, M1);
+- M1 = _mm256_hadd_epi32(M2, M3);
+- M2 = _mm256_hadd_epi32(M4, M5);
+- M3 = _mm256_hadd_epi32(M6, M7);
+-
+- M0 = _mm256_hadd_epi32(M0, M1);
+- M1 = _mm256_hadd_epi32(M2, M3);
++ S0 = _mm256_lddqu_si256((__m256i*)(src));
++ S1 = _mm256_loadu_si256((__m256i*)(src + 4));
++ S2 = _mm256_loadu_si256((__m256i*)(src + 8));
++
++ src += i_src;
++ T0 = _mm256_shuffle_epi8(S0, mShuffle0);
++ T1 = _mm256_shuffle_epi8(S0, mShuffle1);
++ T2 = _mm256_shuffle_epi8(S1, mShuffle0);
++ T3 = _mm256_shuffle_epi8(S1, mShuffle1);
++ T4 = _mm256_shuffle_epi8(S2, mShuffle0);
++ T5 = _mm256_shuffle_epi8(S2, mShuffle1);
++ uavs3d_prefetch(src, _MM_HINT_NTA);
++
++ M0 = _mm256_madd_epi16(T0, mCoef0);
++ M1 = _mm256_madd_epi16(T1, mCoef1);
++ M2 = _mm256_madd_epi16(T2, mCoef2);
++ M3 = _mm256_madd_epi16(T3, mCoef3);
++ M4 = _mm256_madd_epi16(T2, mCoef0);
++ M5 = _mm256_madd_epi16(T3, mCoef1);
++ M6 = _mm256_madd_epi16(T4, mCoef2);
++ M7 = _mm256_madd_epi16(T5, mCoef3);
++
++ M0 = _mm256_add_epi32(M0, M1);
++ M1 = _mm256_add_epi32(M2, M3);
++ M2 = _mm256_add_epi32(M4, M5);
++ M3 = _mm256_add_epi32(M6, M7);
++
++ M0 = _mm256_add_epi32(M0, M1);
++ M1 = _mm256_add_epi32(M2, M3);
+
+ M2 = _mm256_add_epi32(M0, offset);
+ M3 = _mm256_add_epi32(M1, offset);
+@@ -3235,7 +3312,6 @@ void uavs3d_if_hor_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_dst,
+
+ _mm256_storeu_si256((__m256i*)(dst), M2);
+
+- src += i_src;
+ dst += i_dst;
+ }
+ }
+@@ -3244,12 +3320,16 @@ void uavs3d_if_hor_luma_w16x_avx2(const pel *src, int i_src, pel *dst, int i_dst
+ {
+ int col;
+ __m256i max_pel = _mm256_set1_epi16((pel)max_val);
+- __m256i T0, T1, T2, T3, T4, T5, T6, T7;
++ __m256i T0, T1, T2, T3, T4, T5;
+ __m256i M0, M1, M2, M3, M4, M5, M6, M7;
++ __m256i S0, S1, S2;
+ __m256i offset = _mm256_set1_epi32(32);
+- s32 *coef = (s32*)coeff;
+- __m128i mCoef0 = _mm_setr_epi32(coef[0], coef[1], coef[0], coef[1]);
+- __m256i mCoef = _mm256_cvtepi8_epi16(mCoef0);
++ __m256i mShuffle0 = _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
++ __m256i mShuffle1 = _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);
++ __m256i mCoef0 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coeff)[0]));
++ __m256i mCoef1 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coeff)[1]));
++ __m256i mCoef2 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coeff)[2]));
++ __m256i mCoef3 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coeff)[3]));
+
+ src -= 3;
+
+@@ -3258,31 +3338,33 @@ void uavs3d_if_hor_luma_w16x_avx2(const pel *src, int i_src, pel *dst, int i_dst
+ uavs3d_prefetch(src + i_src, _MM_HINT_NTA);
+ for (col = 0; col < width; col += 16)
+ {
+- T0 = _mm256_loadu_si256((__m256i*)(p_src + 0));
+- T1 = _mm256_loadu_si256((__m256i*)(p_src + 1));
+- T2 = _mm256_loadu_si256((__m256i*)(p_src + 2));
+- T3 = _mm256_loadu_si256((__m256i*)(p_src + 3));
+- T4 = _mm256_loadu_si256((__m256i*)(p_src + 4));
+- T5 = _mm256_loadu_si256((__m256i*)(p_src + 5));
+- T6 = _mm256_loadu_si256((__m256i*)(p_src + 6));
+- T7 = _mm256_loadu_si256((__m256i*)(p_src + 7));
+-
+- M0 = _mm256_madd_epi16(T0, mCoef);
+- M1 = _mm256_madd_epi16(T1, mCoef);
+- M2 = _mm256_madd_epi16(T2, mCoef);
+- M3 = _mm256_madd_epi16(T3, mCoef);
+- M4 = _mm256_madd_epi16(T4, mCoef);
+- M5 = _mm256_madd_epi16(T5, mCoef);
+- M6 = _mm256_madd_epi16(T6, mCoef);
+- M7 = _mm256_madd_epi16(T7, mCoef);
+-
+- M0 = _mm256_hadd_epi32(M0, M1);
+- M1 = _mm256_hadd_epi32(M2, M3);
+- M2 = _mm256_hadd_epi32(M4, M5);
+- M3 = _mm256_hadd_epi32(M6, M7);
+-
+- M0 = _mm256_hadd_epi32(M0, M1);
+- M1 = _mm256_hadd_epi32(M2, M3);
++ S0 = _mm256_loadu_si256((__m256i*)(p_src));
++ S1 = _mm256_loadu_si256((__m256i*)(p_src + 4));
++ S2 = _mm256_loadu_si256((__m256i*)(p_src + 8));
++
++ T0 = _mm256_shuffle_epi8(S0, mShuffle0);
++ T1 = _mm256_shuffle_epi8(S0, mShuffle1);
++ T2 = _mm256_shuffle_epi8(S1, mShuffle0);
++ T3 = _mm256_shuffle_epi8(S1, mShuffle1);
++ T4 = _mm256_shuffle_epi8(S2, mShuffle0);
++ T5 = _mm256_shuffle_epi8(S2, mShuffle1);
++
++ M0 = _mm256_madd_epi16(T0, mCoef0);
++ M1 = _mm256_madd_epi16(T1, mCoef1);
++ M2 = _mm256_madd_epi16(T2, mCoef2);
++ M3 = _mm256_madd_epi16(T3, mCoef3);
++ M4 = _mm256_madd_epi16(T2, mCoef0);
++ M5 = _mm256_madd_epi16(T3, mCoef1);
++ M6 = _mm256_madd_epi16(T4, mCoef2);
++ M7 = _mm256_madd_epi16(T5, mCoef3);
++
++ M0 = _mm256_add_epi32(M0, M1);
++ M1 = _mm256_add_epi32(M2, M3);
++ M2 = _mm256_add_epi32(M4, M5);
++ M3 = _mm256_add_epi32(M6, M7);
++
++ M0 = _mm256_add_epi32(M0, M1);
++ M1 = _mm256_add_epi32(M2, M3);
+
+ M2 = _mm256_add_epi32(M0, offset);
+ M3 = _mm256_add_epi32(M1, offset);
+@@ -3299,6 +3381,47 @@ void uavs3d_if_hor_luma_w16x_avx2(const pel *src, int i_src, pel *dst, int i_dst
+ }
+ }
+
++void uavs3d_if_hor_chroma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst, int width, int height, const s8 *coeff, int max_val)
++{ // Horizontal 4-tap filter with taps two samples apart, 128 bits of output per row; width is not read. Assumes pel is 16 bits wide - TODO confirm.
++ const int offset = 32; // rounding term for the shift below
++ const int shift = 6;
++
++ __m128i coef0 = _mm_cvtepi8_epi16(_mm_set1_epi16(((s16*)coeff)[0])); // taps 0,1 sign-extended to 16 bits and repeated
++ __m128i coef1 = _mm_cvtepi8_epi16(_mm_set1_epi16(((s16*)coeff)[1])); // taps 2,3
++ __m256i mCoef0 = _mm256_set_m128i(coef1, coef0); // low lane: taps 0,1 / high lane: taps 2,3
++ __m256i mCoef1 = _mm256_set_m128i(coef0, coef1); // low lane: taps 2,3 / high lane: taps 0,1
++ __m256i mSwitch = _mm256_setr_epi8(0, 1, 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 0, 1, 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11); // per lane: 16-bit pairs (0,2)(1,3)(2,4)(3,5)
++ __m256i mAddOffset = _mm256_set1_epi32((s16)offset);
++ __m256i T0, T1, S0, S1;
++ __m128i max_pel = _mm_set1_epi16((pel)max_val); // upper clamp, applied after the 16-bit pack
++ __m128i s0;
++
++ src -= 2; // start two samples to the left of the output position
++
++ while (height--) {
++ uavs3d_prefetch(src + i_src * 2, _MM_HINT_NTA);
++ S0 = _mm256_loadu_si256((__m256i*)(src)); // 256 bits from src
++ s0 = _mm_loadu_si128((__m128i*)(src + 4)); // 128 bits from src + 4
++ src += i_src;
++ S1 = _mm256_set_m128i(s0, s0); // the +4 load duplicated into both lanes
++ uavs3d_prefetch(src, _MM_HINT_NTA);
++ T0 = _mm256_shuffle_epi8(S0, mSwitch);
++ T1 = _mm256_shuffle_epi8(S1, mSwitch);
++ T0 = _mm256_madd_epi16(T0, mCoef0);
++ T1 = _mm256_madd_epi16(T1, mCoef1);
++ T0 = _mm256_add_epi32(T0, T1); // both tap pairs combined: low lane and high lane each hold four 32-bit sums
++
++ T0 = _mm256_add_epi32(T0, mAddOffset);
++ T0 = _mm256_srai_epi32(T0, shift); // (sum + 32) >> 6
++ s0 = _mm_packus_epi32(_mm256_castsi256_si128(T0), _mm256_extracti128_si256(T0, 1)); // pack first so negative sums saturate to 0
++ s0 = _mm_min_epu16(s0, max_pel); // clamp after packing: a 16-bit min on the 32-bit sums turns negative sums into 0xFFFF outputs
++
++ _mm_storeu_si128((__m128i*)(dst), s0);
++
++ dst += i_dst;
++ }
++}
++
+ void uavs3d_if_hor_chroma_w16_avx2(const pel *src, int i_src, pel *dst, int i_dst, int width, int height, const s8 *coeff, int max_val)
+ {
+ const int offset = 32;
+@@ -3310,7 +3433,7 @@ void uavs3d_if_hor_chroma_w16_avx2(const pel *src, int i_src, pel *dst, int i_ds
+ __m256i mCoef1 = _mm256_cvtepi8_epi16(coef1);
+ __m256i mSwitch = _mm256_setr_epi8(0, 1, 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 0, 1, 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11);
+ __m256i mAddOffset = _mm256_set1_epi32((s16)offset);
+- __m256i T0, T1, T2, T3, S0, S1, S2, S3;
++ __m256i T0, T1, T2, T3, S0, S1, S2;
+ __m256i max_pel = _mm256_set1_epi16((pel)max_val);
+
+ src -= 2;
+@@ -3320,15 +3443,14 @@ void uavs3d_if_hor_chroma_w16_avx2(const pel *src, int i_src, pel *dst, int i_ds
+ S0 = _mm256_loadu_si256((__m256i*)(src));
+ S1 = _mm256_loadu_si256((__m256i*)(src + 4));
+ S2 = _mm256_loadu_si256((__m256i*)(src + 8));
+- S3 = _mm256_loadu_si256((__m256i*)(src + 12));
+- S0 = _mm256_permute4x64_epi64(S0, 0x94);
+- S1 = _mm256_permute4x64_epi64(S1, 0x94);
+- S2 = _mm256_permute4x64_epi64(S2, 0x94);
+- S3 = _mm256_permute4x64_epi64(S3, 0x94);
+- T0 = _mm256_madd_epi16(_mm256_shuffle_epi8(S0, mSwitch), mCoef0);
+- T1 = _mm256_madd_epi16(_mm256_shuffle_epi8(S1, mSwitch), mCoef1);
+- T2 = _mm256_madd_epi16(_mm256_shuffle_epi8(S2, mSwitch), mCoef0);
+- T3 = _mm256_madd_epi16(_mm256_shuffle_epi8(S3, mSwitch), mCoef1);
++ T0 = _mm256_shuffle_epi8(S0, mSwitch);
++ T1 = _mm256_shuffle_epi8(S1, mSwitch);
++ T2 = _mm256_shuffle_epi8(S1, mSwitch);
++ T3 = _mm256_shuffle_epi8(S2, mSwitch);
++ T0 = _mm256_madd_epi16(T0, mCoef0);
++ T1 = _mm256_madd_epi16(T1, mCoef1);
++ T2 = _mm256_madd_epi16(T2, mCoef0);
++ T3 = _mm256_madd_epi16(T3, mCoef1);
+ T0 = _mm256_add_epi32(T0, T1);
+ T2 = _mm256_add_epi32(T2, T3);
+
+@@ -3337,7 +3459,6 @@ void uavs3d_if_hor_chroma_w16_avx2(const pel *src, int i_src, pel *dst, int i_ds
+ T0 = _mm256_srai_epi32(T0, shift);
+ T2 = _mm256_srai_epi32(T2, shift);
+ T0 = _mm256_packus_epi32(T0, T2);
+- T0 = _mm256_permute4x64_epi64(T0, 0xd8);
+
+ T0 = _mm256_min_epu16(T0, max_pel);
+ _mm256_storeu_si256((__m256i*)(dst), T0);
+@@ -3359,7 +3480,7 @@ void uavs3d_if_hor_chroma_w16x_avx2(const pel *src, int i_src, pel *dst, int i_d
+ __m256i mCoef1 = _mm256_cvtepi8_epi16(coef1);
+ __m256i mSwitch = _mm256_setr_epi8(0, 1, 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 0, 1, 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11);
+ __m256i mAddOffset = _mm256_set1_epi32((s16)offset);
+- __m256i T0, T1, T2, T3, S0, S1, S2, S3;
++ __m256i T0, T1, T2, T3, S0, S1, S2;
+ __m256i max_pel = _mm256_set1_epi16((pel)max_val);
+
+ src -= 2;
+@@ -3370,15 +3491,14 @@ void uavs3d_if_hor_chroma_w16x_avx2(const pel *src, int i_src, pel *dst, int i_d
+ S0 = _mm256_loadu_si256((__m256i*)(src + col));
+ S1 = _mm256_loadu_si256((__m256i*)(src + col + 4));
+ S2 = _mm256_loadu_si256((__m256i*)(src + col + 8));
+- S3 = _mm256_loadu_si256((__m256i*)(src + col + 12));
+- S0 = _mm256_permute4x64_epi64(S0, 0x94);
+- S1 = _mm256_permute4x64_epi64(S1, 0x94);
+- S2 = _mm256_permute4x64_epi64(S2, 0x94);
+- S3 = _mm256_permute4x64_epi64(S3, 0x94);
+- T0 = _mm256_madd_epi16(_mm256_shuffle_epi8(S0, mSwitch), mCoef0);
+- T1 = _mm256_madd_epi16(_mm256_shuffle_epi8(S1, mSwitch), mCoef1);
+- T2 = _mm256_madd_epi16(_mm256_shuffle_epi8(S2, mSwitch), mCoef0);
+- T3 = _mm256_madd_epi16(_mm256_shuffle_epi8(S3, mSwitch), mCoef1);
++ T0 = _mm256_shuffle_epi8(S0, mSwitch);
++ T1 = _mm256_shuffle_epi8(S1, mSwitch);
++ T2 = _mm256_shuffle_epi8(S1, mSwitch);
++ T3 = _mm256_shuffle_epi8(S2, mSwitch);
++ T0 = _mm256_madd_epi16(T0, mCoef0);
++ T1 = _mm256_madd_epi16(T1, mCoef1);
++ T2 = _mm256_madd_epi16(T2, mCoef0);
++ T3 = _mm256_madd_epi16(T3, mCoef1);
+ T0 = _mm256_add_epi32(T0, T1);
+ T2 = _mm256_add_epi32(T2, T3);
+
+@@ -3387,9 +3507,8 @@ void uavs3d_if_hor_chroma_w16x_avx2(const pel *src, int i_src, pel *dst, int i_d
+ T0 = _mm256_srai_epi32(T0, shift);
+ T2 = _mm256_srai_epi32(T2, shift);
+ T0 = _mm256_packus_epi32(T0, T2);
+- T0 = _mm256_permute4x64_epi64(T0, 0xd8);
+-
+ T0 = _mm256_min_epu16(T0, max_pel);
++
+ _mm256_storeu_si256((__m256i*)(dst + col), T0);
+ }
+ src += i_src;
+@@ -3397,6 +3516,93 @@ void uavs3d_if_hor_chroma_w16x_avx2(const pel *src, int i_src, pel *dst, int i_d
+ }
+ }
+
++void uavs3d_if_ver_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst, int width, int height, const s8 *coeff, int max_val) // Vertical 8-tap filter: each loop pass reads 9 source rows and writes 2 output rows of 128 bits each. `width` is never read in the body.
++{
++ const int i_src2 = i_src * 2; // precomputed multiples of the source stride, used as row offsets from src
++ const int i_src3 = i_src * 3;
++ const int i_src4 = i_src * 4;
++ const int i_src5 = i_src * 5;
++ const int i_src6 = i_src * 6;
++ const int i_src7 = i_src * 7;
++ __m128i coeff0 = _mm_set1_epi16(*(s16*)coeff); // taps 0 and 1 read together as one 16-bit value (type-punned s8 pair) and broadcast
++ __m128i coeff1 = _mm_set1_epi16(*(s16*)(coeff + 2)); // taps 2 and 3
++ __m128i coeff2 = _mm_set1_epi16(*(s16*)(coeff + 4)); // taps 4 and 5
++ __m128i coeff3 = _mm_set1_epi16(*(s16*)(coeff + 6)); // taps 6 and 7
++ __m256i max_pel = _mm256_set1_epi16((pel)max_val); // upper clamp applied to every output sample
++ __m256i mAddOffset = _mm256_set1_epi32(32); // rounding term for the >> 6 below (half of 1 << 6)
++ __m128i s0, s1, s2, s3, s4, s5, s6, s7, s8;
++ __m256i T0, T1, T2, T3, T4, T5, T6, T7;
++ __m256i N0, N1, N2, N3, N4, N5, N6, N7;
++ __m256i coeff00 = _mm256_cvtepi8_epi16(coeff0); // sign-extend the bytes: 16-bit lanes now alternate (tap0, tap1, tap0, tap1, ...)
++ __m256i coeff01 = _mm256_cvtepi8_epi16(coeff1); // (tap2, tap3, ...)
++ __m256i coeff02 = _mm256_cvtepi8_epi16(coeff2); // (tap4, tap5, ...)
++ __m256i coeff03 = _mm256_cvtepi8_epi16(coeff3); // (tap6, tap7, ...)
++
++ src -= i_src3; // move three rows up so that tap k reads row k relative to the new src
++
++ while (height > 0) { // two output rows per pass; presumably callers pass an even height -- TODO confirm
++ s0 = _mm_loadu_si128((__m128i*)(src)); // nine consecutive rows, 128 bits each (unaligned loads)
++ s1 = _mm_loadu_si128((__m128i*)(src + i_src));
++ s2 = _mm_loadu_si128((__m128i*)(src + i_src2));
++ s3 = _mm_loadu_si128((__m128i*)(src + i_src3));
++ s4 = _mm_loadu_si128((__m128i*)(src + i_src4));
++ s5 = _mm_loadu_si128((__m128i*)(src + i_src5));
++ s6 = _mm_loadu_si128((__m128i*)(src + i_src6));
++ s7 = _mm_loadu_si128((__m128i*)(src + i_src7));
++ s8 = _mm_loadu_si128((__m128i*)(src + (i_src << 3))); // ninth row: only needed by the second output row
++
++ height -= 2;
++ src += i_src2; // advance by the two rows produced in this pass
++ uavs3d_prefetch(src + i_src7, _MM_HINT_NTA); // after the advance this is the row just past the nine loaded above
++
++ T0 = _mm256_set_m128i(s1, s0); // low lane = row k (feeds output row 0), high lane = row k+1 (feeds output row 1)
++ T1 = _mm256_set_m128i(s2, s1);
++ T2 = _mm256_set_m128i(s3, s2);
++ T3 = _mm256_set_m128i(s4, s3);
++ T4 = _mm256_set_m128i(s5, s4);
++ T5 = _mm256_set_m128i(s6, s5);
++ T6 = _mm256_set_m128i(s7, s6);
++ T7 = _mm256_set_m128i(s8, s7);
++
++ N0 = _mm256_unpacklo_epi16(T0, T1); // interleave vertically adjacent samples; "lo" covers the first four 16-bit samples of each lane
++ N1 = _mm256_unpacklo_epi16(T2, T3);
++ N2 = _mm256_unpacklo_epi16(T4, T5);
++ N3 = _mm256_unpacklo_epi16(T6, T7);
++ N4 = _mm256_unpackhi_epi16(T0, T1); // "hi" covers the last four 16-bit samples of each lane
++ N5 = _mm256_unpackhi_epi16(T2, T3);
++ N6 = _mm256_unpackhi_epi16(T4, T5);
++ N7 = _mm256_unpackhi_epi16(T6, T7);
++
++ N0 = _mm256_madd_epi16(N0, coeff00); // per 32-bit element: tap0 * upper row + tap1 * lower row
++ N1 = _mm256_madd_epi16(N1, coeff01);
++ N2 = _mm256_madd_epi16(N2, coeff02);
++ N3 = _mm256_madd_epi16(N3, coeff03);
++ N4 = _mm256_madd_epi16(N4, coeff00);
++ N5 = _mm256_madd_epi16(N5, coeff01);
++ N6 = _mm256_madd_epi16(N6, coeff02);
++ N7 = _mm256_madd_epi16(N7, coeff03);
++
++ N0 = _mm256_add_epi32(N0, N1); // accumulate the four tap-pair products (first half of each row)
++ N1 = _mm256_add_epi32(N2, N3);
++ N2 = _mm256_add_epi32(N4, N5); // same for the second half of each row
++ N3 = _mm256_add_epi32(N6, N7);
++
++ N0 = _mm256_add_epi32(N0, N1); // full 8-tap sums, first four samples of both output rows
++ N1 = _mm256_add_epi32(N2, N3); // full 8-tap sums, last four samples of both output rows
++
++ N0 = _mm256_add_epi32(N0, mAddOffset); // round ...
++ N1 = _mm256_add_epi32(N1, mAddOffset);
++ N0 = _mm256_srai_epi32(N0, 6); // ... and normalise with an arithmetic shift by 6
++ N1 = _mm256_srai_epi32(N1, 6);
++ N0 = _mm256_packus_epi32(N0, N1); // pack to 16 bits with unsigned saturation (negative sums become 0); packing is per lane, so each lane holds one complete output row
++ N0 = _mm256_min_epu16(N0, max_pel); // clamp to max_val
++ _mm_storeu_si128((__m128i*)(dst), _mm256_castsi256_si128(N0)); // low lane -> first output row
++ _mm_storeu_si128((__m128i*)(dst + i_dst), _mm256_extracti128_si256(N0, 1)); // high lane -> second output row
++
++ dst += i_dst << 1; // two rows written
++ }
++}
++
+ void uavs3d_if_ver_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_dst, int width, int height, const s8 *coeff, int max_val)
+ {
+ const int i_src2 = i_src * 2;
+@@ -3412,7 +3618,6 @@ void uavs3d_if_ver_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_dst,
+ __m256i max_pel = _mm256_set1_epi16((pel)max_val);
+ __m256i mAddOffset = _mm256_set1_epi32(32);
+ __m256i T0, T1, T2, T3, T4, T5, T6, T7;
+- __m256i M0, M1, M2, M3, M4, M5, M6, M7;
+ __m256i N0, N1, N2, N3, N4, N5, N6, N7;
+ __m256i coeff00 = _mm256_cvtepi8_epi16(coeff0);
+ __m256i coeff01 = _mm256_cvtepi8_epi16(coeff1);
+@@ -3422,7 +3627,6 @@ void uavs3d_if_ver_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_dst,
+ src -= 3 * i_src;
+
+ while (height--) {
+- uavs3d_prefetch(src + 8 * i_src, _MM_HINT_NTA);
+ T0 = _mm256_loadu_si256((__m256i*)(src));
+ T1 = _mm256_loadu_si256((__m256i*)(src + i_src));
+ T2 = _mm256_loadu_si256((__m256i*)(src + i_src2));
+@@ -3431,24 +3635,25 @@ void uavs3d_if_ver_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_dst,
+ T5 = _mm256_loadu_si256((__m256i*)(src + i_src5));
+ T6 = _mm256_loadu_si256((__m256i*)(src + i_src6));
+ T7 = _mm256_loadu_si256((__m256i*)(src + i_src7));
++ uavs3d_prefetch(src + 8 * i_src, _MM_HINT_NTA);
+
+- M0 = _mm256_unpacklo_epi16(T0, T1);
+- M1 = _mm256_unpacklo_epi16(T2, T3);
+- M2 = _mm256_unpacklo_epi16(T4, T5);
+- M3 = _mm256_unpacklo_epi16(T6, T7);
+- M4 = _mm256_unpackhi_epi16(T0, T1);
+- M5 = _mm256_unpackhi_epi16(T2, T3);
+- M6 = _mm256_unpackhi_epi16(T4, T5);
+- M7 = _mm256_unpackhi_epi16(T6, T7);
+-
+- N0 = _mm256_madd_epi16(M0, coeff00);
+- N1 = _mm256_madd_epi16(M1, coeff01);
+- N2 = _mm256_madd_epi16(M2, coeff02);
+- N3 = _mm256_madd_epi16(M3, coeff03);
+- N4 = _mm256_madd_epi16(M4, coeff00);
+- N5 = _mm256_madd_epi16(M5, coeff01);
+- N6 = _mm256_madd_epi16(M6, coeff02);
+- N7 = _mm256_madd_epi16(M7, coeff03);
++ N0 = _mm256_unpacklo_epi16(T0, T1);
++ N1 = _mm256_unpacklo_epi16(T2, T3);
++ N2 = _mm256_unpacklo_epi16(T4, T5);
++ N3 = _mm256_unpacklo_epi16(T6, T7);
++ N4 = _mm256_unpackhi_epi16(T0, T1);
++ N5 = _mm256_unpackhi_epi16(T2, T3);
++ N6 = _mm256_unpackhi_epi16(T4, T5);
++ N7 = _mm256_unpackhi_epi16(T6, T7);
++
++ N0 = _mm256_madd_epi16(N0, coeff00);
++ N1 = _mm256_madd_epi16(N1, coeff01);
++ N2 = _mm256_madd_epi16(N2, coeff02);
++ N3 = _mm256_madd_epi16(N3, coeff03);
++ N4 = _mm256_madd_epi16(N4, coeff00);
++ N5 = _mm256_madd_epi16(N5, coeff01);
++ N6 = _mm256_madd_epi16(N6, coeff02);
++ N7 = _mm256_madd_epi16(N7, coeff03);
+
+ N0 = _mm256_add_epi32(N0, N1);
+ N1 = _mm256_add_epi32(N2, N3);
+@@ -3568,20 +3773,23 @@ void uavs3d_if_ver_chroma_w16_avx2(const pel *src, int i_src, pel *dst, int i_ds
+
+ while (height) {
+ __m256i S0, S1, S2, S3, S4;
+- uavs3d_prefetch(src + 5 * i_src, _MM_HINT_NTA);
+- uavs3d_prefetch(src + 6 * i_src, _MM_HINT_NTA);
+- height -= 2;
+ S0 = _mm256_loadu_si256((__m256i*)(src));
+ S1 = _mm256_loadu_si256((__m256i*)(src + i_src));
+ S2 = _mm256_loadu_si256((__m256i*)(src + i_src2));
+ S3 = _mm256_loadu_si256((__m256i*)(src + i_src3));
+ S4 = _mm256_loadu_si256((__m256i*)(src + i_src4));
+
++ height -= 2;
++ src += i_src2;
++
+ T0 = _mm256_unpacklo_epi16(S0, S1);
+ T1 = _mm256_unpackhi_epi16(S0, S1);
+ T2 = _mm256_unpacklo_epi16(S2, S3);
+ T3 = _mm256_unpackhi_epi16(S2, S3);
+
++ uavs3d_prefetch(src + i_src3, _MM_HINT_NTA);
++ uavs3d_prefetch(src + i_src4, _MM_HINT_NTA);
++
+ T0 = _mm256_madd_epi16(T0, coeff0);
+ T1 = _mm256_madd_epi16(T1, coeff0);
+ T2 = _mm256_madd_epi16(T2, coeff1);
+@@ -3621,7 +3829,6 @@ void uavs3d_if_ver_chroma_w16_avx2(const pel *src, int i_src, pel *dst, int i_ds
+ mVal1 = _mm256_min_epu16(mVal1, max_pel);
+ _mm256_storeu_si256((__m256i*)(dst + i_dst), mVal1);
+
+- src += 2 * i_src;
+ dst += 2 * i_dst;
+ }
+ }
+@@ -3645,9 +3852,6 @@ void uavs3d_if_ver_chroma_w32_avx2(const pel *src, int i_src, pel *dst, int i_ds
+ src -= i_src;
+
+ while (height) {
+- uavs3d_prefetch(src + 5 * i_src, _MM_HINT_NTA);
+- uavs3d_prefetch(src + 6 * i_src, _MM_HINT_NTA);
+- height -= 2;
+ S0 = _mm256_loadu_si256((__m256i*)(src));
+ S5 = _mm256_loadu_si256((__m256i*)(src + 16));
+ S1 = _mm256_loadu_si256((__m256i*)(src + i_src));
+@@ -3659,6 +3863,9 @@ void uavs3d_if_ver_chroma_w32_avx2(const pel *src, int i_src, pel *dst, int i_ds
+ S4 = _mm256_loadu_si256((__m256i*)(src + i_src4));
+ S9 = _mm256_loadu_si256((__m256i*)(src + i_src4 + 16));
+
++ height -= 2;
++ src += i_src2;
++
+ T0 = _mm256_unpacklo_epi16(S0, S1);
+ T1 = _mm256_unpackhi_epi16(S0, S1);
+ T2 = _mm256_unpacklo_epi16(S2, S3);
+@@ -3668,6 +3875,9 @@ void uavs3d_if_ver_chroma_w32_avx2(const pel *src, int i_src, pel *dst, int i_ds
+ T6 = _mm256_unpacklo_epi16(S7, S8);
+ T7 = _mm256_unpackhi_epi16(S7, S8);
+
++ uavs3d_prefetch(src + i_src3, _MM_HINT_NTA);
++ uavs3d_prefetch(src + i_src4, _MM_HINT_NTA);
++
+ T0 = _mm256_madd_epi16(T0, coeff0);
+ T1 = _mm256_madd_epi16(T1, coeff0);
+ T2 = _mm256_madd_epi16(T2, coeff1);
+@@ -3738,7 +3948,6 @@ void uavs3d_if_ver_chroma_w32_avx2(const pel *src, int i_src, pel *dst, int i_ds
+ _mm256_storeu_si256((__m256i*)(dst + i_dst), T0);
+ _mm256_storeu_si256((__m256i*)(dst + i_dst + 16), T2);
+
+- src += 2 * i_src;
+ dst += 2 * i_dst;
+ }
+ }
+@@ -3820,22 +4029,18 @@ void uavs3d_if_ver_chroma_w32x_avx2(const pel *src, int i_src, pel *dst, int i_d
+ }
+ }
+
+-void uavs3d_if_hor_ver_luma_w16x_avx2(const pel *src, int i_src, pel *dst, int i_dst, int width, int height, const s8 *coef_x, const s8 *coef_y, int max_val)
++void uavs3d_if_hor_ver_luma_w4_avx2(const pel *src, int i_src, pel *dst, int i_dst, int width, int height, const s8 *coef_x, const s8 *coef_y, int max_val)
+ {
+- ALIGNED_32(s16 tmp_res[(128 + 7) * 128]);
++ ALIGNED_32(s16 tmp_res[(32 + 7) * 4]);
+ s16 *tmp = tmp_res;
+- int row, i;;
++ int row;
+ int add1, shift1;
+ int add2, shift2;
+- __m128i mCoef0;
+- __m256i mCoef, offset;
+- __m256i T0, T1, T2, T3, T4, T5, T6, T7;
+- __m256i M0, M1, M2, M3, M4, M5, M6, M7;
+- __m256i N0, N1, N2, N3, N4, N5, N6, N7;
+- int i_tmp = width;
+- s32 * coef;
+- __m128i coeff0, coeff1, coeff2, coeff3;
+- __m256i coeff00, coeff01, coeff02, coeff03;
++ __m256i offset;
++ __m256i T0, T1, T2, T3;
++ __m256i M0, M1, M2, M3;
++ const int i_tmp = 4;
++ __m256i mCoef0, mCoef1, mCoef2, mCoef3;
+ __m256i max_pel = _mm256_set1_epi16((pel)max_val);
+
+ if (max_val == 255) { // 8 bit_depth
+@@ -3851,80 +4056,311 @@ void uavs3d_if_hor_ver_luma_w16x_avx2(const pel *src, int i_src, pel *dst, int i
+ add2 = 1 << (shift2 - 1);
+
+ src += -3 * i_src - 3;
+- coef = (s32*)coef_x;
+- mCoef0 = _mm_setr_epi32(coef[0], coef[1], coef[0], coef[1]);
+- mCoef = _mm256_cvtepi8_epi16(mCoef0);
+- offset = _mm256_set1_epi32(add1);
+
+- row = height + 7;
++ {
++ __m128i s0, s1, s2, s3;
++ __m256i S0, S1;
++ __m256i mShuffle0 = _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
++ __m256i mShuffle1 = _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);
++
++ mCoef0 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_x)[0]));
++ mCoef1 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_x)[1]));
++ mCoef2 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_x)[2]));
++ mCoef3 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_x)[3]));
++ offset = _mm256_set1_epi32(add1);
++
++ row = height + 6;
++
++ while (row > 0) {
++ s0 = _mm_loadu_si128((__m128i*)(src));
++ s1 = _mm_loadu_si128((__m128i*)(src + 4));
++ s2 = _mm_loadu_si128((__m128i*)(src + i_src));
++ s3 = _mm_loadu_si128((__m128i*)(src + i_src + 4));
++ row -= 2;
++ src += i_src << 1;
++ uavs3d_prefetch(src, _MM_HINT_NTA);
++ uavs3d_prefetch(src + i_src, _MM_HINT_NTA);
+
+- while (row--) {
+- const pel *p = src;
+- uavs3d_prefetch(src + i_src, _MM_HINT_NTA);
+- for (i = 0; i < width; i += 16) {
++ S0 = _mm256_set_m128i(s2, s0);
++ S1 = _mm256_set_m128i(s3, s1);
++
++ T0 = _mm256_shuffle_epi8(S0, mShuffle0);
++ T1 = _mm256_shuffle_epi8(S0, mShuffle1);
++ T2 = _mm256_shuffle_epi8(S1, mShuffle0);
++ T3 = _mm256_shuffle_epi8(S1, mShuffle1);
+
+- T0 = _mm256_loadu_si256((__m256i*)p++);
+- T1 = _mm256_loadu_si256((__m256i*)p++);
+- T2 = _mm256_loadu_si256((__m256i*)p++);
+- T3 = _mm256_loadu_si256((__m256i*)p++);
+- T4 = _mm256_loadu_si256((__m256i*)p++);
+- T5 = _mm256_loadu_si256((__m256i*)p++);
+- T6 = _mm256_loadu_si256((__m256i*)p++);
+- T7 = _mm256_loadu_si256((__m256i*)p++);
+-
+- M0 = _mm256_madd_epi16(T0, mCoef);
+- M1 = _mm256_madd_epi16(T1, mCoef);
+- M2 = _mm256_madd_epi16(T2, mCoef);
+- M3 = _mm256_madd_epi16(T3, mCoef);
+- M4 = _mm256_madd_epi16(T4, mCoef);
+- M5 = _mm256_madd_epi16(T5, mCoef);
+- M6 = _mm256_madd_epi16(T6, mCoef);
+- M7 = _mm256_madd_epi16(T7, mCoef);
+-
+- M0 = _mm256_hadd_epi32(M0, M1);
+- M1 = _mm256_hadd_epi32(M2, M3);
+- M2 = _mm256_hadd_epi32(M4, M5);
+- M3 = _mm256_hadd_epi32(M6, M7);
+-
+- M0 = _mm256_hadd_epi32(M0, M1);
+- M1 = _mm256_hadd_epi32(M2, M3);
++ M0 = _mm256_madd_epi16(T0, mCoef0);
++ M1 = _mm256_madd_epi16(T1, mCoef1);
++ M2 = _mm256_madd_epi16(T2, mCoef2);
++ M3 = _mm256_madd_epi16(T3, mCoef3);
++
++ M0 = _mm256_add_epi32(M0, M1);
++ M1 = _mm256_add_epi32(M2, M3);
++
++ M0 = _mm256_add_epi32(M0, M1);
++
++ M2 = _mm256_add_epi32(M0, offset);
++ M2 = _mm256_srai_epi32(M2, shift1);
++
++ s0 = _mm_packs_epi32(_mm256_castsi256_si128(M2), _mm256_extracti128_si256(M2, 1));
++ _mm_store_si128((__m128i*)(tmp), s0);
++
++ tmp += i_tmp * 2;
++ }
++ {
++ // the last row
++ __m128i t0, t1, t2, t3;
++ __m128i m0, m1, m2, m3;
++ s0 = _mm_loadu_si128((__m128i*)(src));
++ s1 = _mm_loadu_si128((__m128i*)(src + 4));
++ src += i_src;
++
++ t0 = _mm_shuffle_epi8(s0, _mm256_castsi256_si128(mShuffle0));
++ t1 = _mm_shuffle_epi8(s0, _mm256_castsi256_si128(mShuffle1));
++ t2 = _mm_shuffle_epi8(s1, _mm256_castsi256_si128(mShuffle0));
++ t3 = _mm_shuffle_epi8(s1, _mm256_castsi256_si128(mShuffle1));
++
++ m0 = _mm_madd_epi16(t0, _mm256_castsi256_si128(mCoef0));
++ m1 = _mm_madd_epi16(t1, _mm256_castsi256_si128(mCoef1));
++ m2 = _mm_madd_epi16(t2, _mm256_castsi256_si128(mCoef2));
++ m3 = _mm_madd_epi16(t3, _mm256_castsi256_si128(mCoef3));
++
++ m0 = _mm_add_epi32(m0, m1);
++ m1 = _mm_add_epi32(m2, m3);
++
++ m0 = _mm_add_epi32(m0, m1);
++
++ m0 = _mm_add_epi32(m0, _mm256_castsi256_si128(offset));
++ m0 = _mm_srai_epi32(m0, shift1);
++ m0 = _mm_packs_epi32(m0, m0);
++ _mm_storel_epi64((__m128i*)tmp, m0);
++ }
++ }
++
++ {
++ __m256i T4, T5, T6, T7, M4, M5, M6, M7;
++ __m128i d0, d1;
++
++ offset = _mm256_set1_epi32(add2);
++ tmp = tmp_res;
++
++ mCoef0 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_y)[0]));
++ mCoef1 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_y)[1]));
++ mCoef2 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_y)[2]));
++ mCoef3 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_y)[3]));
++
++ while (height > 0) {
++ T0 = _mm256_load_si256((__m256i*)(tmp));
++ T1 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp));
++ T2 = _mm256_loadu_si256((__m256i*)(tmp + 2 * i_tmp));
++ T3 = _mm256_loadu_si256((__m256i*)(tmp + 3 * i_tmp));
++ T4 = _mm256_load_si256((__m256i*)(tmp + 4 * i_tmp));
++ T5 = _mm256_loadu_si256((__m256i*)(tmp + 5 * i_tmp));
++ T6 = _mm256_loadu_si256((__m256i*)(tmp + 6 * i_tmp));
++ T7 = _mm256_loadu_si256((__m256i*)(tmp + 7 * i_tmp));
++ height -= 4;
++ tmp += i_tmp * 4;
++
++ M0 = _mm256_unpacklo_epi16(T0, T1);
++ M1 = _mm256_unpacklo_epi16(T2, T3);
++ M2 = _mm256_unpacklo_epi16(T4, T5);
++ M3 = _mm256_unpacklo_epi16(T6, T7);
++ M4 = _mm256_unpackhi_epi16(T0, T1);
++ M5 = _mm256_unpackhi_epi16(T2, T3);
++ M6 = _mm256_unpackhi_epi16(T4, T5);
++ M7 = _mm256_unpackhi_epi16(T6, T7);
++
++ M0 = _mm256_madd_epi16(M0, mCoef0);
++ M1 = _mm256_madd_epi16(M1, mCoef1);
++ M2 = _mm256_madd_epi16(M2, mCoef2);
++ M3 = _mm256_madd_epi16(M3, mCoef3);
++ M4 = _mm256_madd_epi16(M4, mCoef0);
++ M5 = _mm256_madd_epi16(M5, mCoef1);
++ M6 = _mm256_madd_epi16(M6, mCoef2);
++ M7 = _mm256_madd_epi16(M7, mCoef3);
++
++ M0 = _mm256_add_epi32(M0, M1);
++ M1 = _mm256_add_epi32(M2, M3);
++ M2 = _mm256_add_epi32(M4, M5);
++ M3 = _mm256_add_epi32(M6, M7);
++
++ M0 = _mm256_add_epi32(M0, M1);
++ M1 = _mm256_add_epi32(M2, M3);
++
++ M0 = _mm256_add_epi32(M0, offset);
++ M1 = _mm256_add_epi32(M1, offset);
++ M0 = _mm256_srai_epi32(M0, shift2);
++ M1 = _mm256_srai_epi32(M1, shift2);
++ M0 = _mm256_packus_epi32(M0, M1);
++ M0 = _mm256_min_epu16(M0, max_pel);
++
++ d0 = _mm256_castsi256_si128(M0);
++ d1 = _mm256_extracti128_si256(M0, 1);
++ _mm_storel_epi64((__m128i*)(dst), d0);
++ _mm_storeh_pi((__m64*)(dst + i_dst), _mm_castsi128_ps(d0));
++ _mm_storel_epi64((__m128i*)(dst + (i_dst << 1)), d1);
++ _mm_storeh_pi((__m64*)(dst + i_dst * 3), _mm_castsi128_ps(d1));
++
++ dst += i_dst << 2;
++ }
++ }
++}
++
++void uavs3d_if_hor_ver_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst, int width, int height, const s8 *coef_x, const s8 *coef_y, int max_val)
++{
++ ALIGNED_32(s16 tmp_res[(64 + 7) * 8]);
++ s16 *tmp = tmp_res;
++ int row;
++ int add1, shift1;
++ int add2, shift2;
++ __m256i offset;
++ __m256i T0, T1, T2, T3, T4, T5;
++ __m256i M0, M1, M2, M3, M4, M5, M6, M7;
++ const int i_tmp = 8;
++ __m256i mCoef0, mCoef1, mCoef2, mCoef3;
++ __m256i max_pel = _mm256_set1_epi16((pel)max_val);
++
++ if (max_val == 255) { // 8 bit_depth
++ shift1 = 0;
++ shift2 = 12;
++ }
++ else { // 10 bit_depth
++ shift1 = 2;
++ shift2 = 10;
++ }
++
++ add1 = (1 << (shift1)) >> 1;
++ add2 = 1 << (shift2 - 1);
++
++ src += -3 * i_src - 3;
++
++ {
++ __m128i s0, s1;
++ __m256i S0, S1, S2;
++ __m256i mShuffle0 = _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
++ __m256i mShuffle1 = _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);
++
++ mCoef0 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_x)[0]));
++ mCoef1 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_x)[1]));
++ mCoef2 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_x)[2]));
++ mCoef3 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_x)[3]));
++ offset = _mm256_set1_epi32(add1);
++
++ row = height + 6;
++
++ while (row > 0) {
++ T0 = _mm256_loadu_si256((__m256i*)(src));
++ s0 = _mm_loadu_si128((__m128i*)(src + 4));
++ T1 = _mm256_loadu_si256((__m256i*)(src + i_src));
++ s1 = _mm_loadu_si128((__m128i*)(src + i_src + 4));
++ row -= 2;
++ src += i_src << 1;
++ uavs3d_prefetch(src, _MM_HINT_NTA);
++ uavs3d_prefetch(src + i_src, _MM_HINT_NTA);
++
++ S0 = _mm256_permute2x128_si256(T0, T1, 0x20);
++ S2 = _mm256_permute2x128_si256(T0, T1, 0x31);
++ S1 = _mm256_set_m128i(s1, s0);
++
++ T0 = _mm256_shuffle_epi8(S0, mShuffle0);
++ T1 = _mm256_shuffle_epi8(S0, mShuffle1);
++ T2 = _mm256_shuffle_epi8(S1, mShuffle0);
++ T3 = _mm256_shuffle_epi8(S1, mShuffle1);
++ T4 = _mm256_shuffle_epi8(S2, mShuffle0);
++ T5 = _mm256_shuffle_epi8(S2, mShuffle1);
++
++ M0 = _mm256_madd_epi16(T0, mCoef0);
++ M1 = _mm256_madd_epi16(T1, mCoef1);
++ M2 = _mm256_madd_epi16(T2, mCoef2);
++ M3 = _mm256_madd_epi16(T3, mCoef3);
++ M4 = _mm256_madd_epi16(T2, mCoef0);
++ M5 = _mm256_madd_epi16(T3, mCoef1);
++ M6 = _mm256_madd_epi16(T4, mCoef2);
++ M7 = _mm256_madd_epi16(T5, mCoef3);
++
++ M0 = _mm256_add_epi32(M0, M1);
++ M1 = _mm256_add_epi32(M2, M3);
++ M2 = _mm256_add_epi32(M4, M5);
++ M3 = _mm256_add_epi32(M6, M7);
++
++ M0 = _mm256_add_epi32(M0, M1);
++ M1 = _mm256_add_epi32(M2, M3);
+
+ M2 = _mm256_add_epi32(M0, offset);
+ M3 = _mm256_add_epi32(M1, offset);
+ M2 = _mm256_srai_epi32(M2, shift1);
+ M3 = _mm256_srai_epi32(M3, shift1);
+ M2 = _mm256_packs_epi32(M2, M3);
+- _mm256_storeu_si256((__m256i*)(tmp + i), M2);
+
+- p += 8;
++ _mm256_store_si256((__m256i*)(tmp), M2);
++
++ tmp += i_tmp * 2;
++ }
++ {
++ // the last row
++ __m128i t0, t1, t2, t3, t4, t5;
++ __m128i m0, m1, m2, m3, m4, m5, m6, m7;
++ __m128i s2;
++ s0 = _mm_loadu_si128((__m128i*)(src));
++ s1 = _mm_loadu_si128((__m128i*)(src + 4));
++ s2 = _mm_loadu_si128((__m128i*)(src + 8));
++ src += i_src;
++
++ t0 = _mm_shuffle_epi8(s0, _mm256_castsi256_si128(mShuffle0));
++ t1 = _mm_shuffle_epi8(s0, _mm256_castsi256_si128(mShuffle1));
++ t2 = _mm_shuffle_epi8(s1, _mm256_castsi256_si128(mShuffle0));
++ t3 = _mm_shuffle_epi8(s1, _mm256_castsi256_si128(mShuffle1));
++ t4 = _mm_shuffle_epi8(s2, _mm256_castsi256_si128(mShuffle0));
++ t5 = _mm_shuffle_epi8(s2, _mm256_castsi256_si128(mShuffle1));
++
++ m0 = _mm_madd_epi16(t0, _mm256_castsi256_si128(mCoef0));
++ m1 = _mm_madd_epi16(t1, _mm256_castsi256_si128(mCoef1));
++ m2 = _mm_madd_epi16(t2, _mm256_castsi256_si128(mCoef2));
++ m3 = _mm_madd_epi16(t3, _mm256_castsi256_si128(mCoef3));
++ m4 = _mm_madd_epi16(t2, _mm256_castsi256_si128(mCoef0));
++ m5 = _mm_madd_epi16(t3, _mm256_castsi256_si128(mCoef1));
++ m6 = _mm_madd_epi16(t4, _mm256_castsi256_si128(mCoef2));
++ m7 = _mm_madd_epi16(t5, _mm256_castsi256_si128(mCoef3));
++
++ m0 = _mm_add_epi32(m0, m1);
++ m1 = _mm_add_epi32(m2, m3);
++ m2 = _mm_add_epi32(m4, m5);
++ m3 = _mm_add_epi32(m6, m7);
++
++ m0 = _mm_add_epi32(m0, m1);
++ m1 = _mm_add_epi32(m2, m3);
++
++ m2 = _mm_add_epi32(m0, _mm256_castsi256_si128(offset));
++ m3 = _mm_add_epi32(m1, _mm256_castsi256_si128(offset));
++ m2 = _mm_srai_epi32(m2, shift1);
++ m3 = _mm_srai_epi32(m3, shift1);
++ m2 = _mm_packs_epi32(m2, m3);
++ _mm_store_si128((__m128i*)tmp, m2);
+ }
+- tmp += i_tmp;
+- src += i_src;
+ }
+
+- offset = _mm256_set1_epi32(add2);
+- tmp = tmp_res;
++ {
++ __m256i N0, N1, N2, N3, N4, N5, N6, N7;
++ __m256i T6, T7;
++ offset = _mm256_set1_epi32(add2);
++ tmp = tmp_res;
+
+- coeff0 = _mm_set1_epi16(*(s16*)(coef_y));
+- coeff1 = _mm_set1_epi16(*(s16*)(coef_y + 2));
+- coeff2 = _mm_set1_epi16(*(s16*)(coef_y + 4));
+- coeff3 = _mm_set1_epi16(*(s16*)(coef_y + 6));
+- coeff00 = _mm256_cvtepi8_epi16(coeff0);
+- coeff01 = _mm256_cvtepi8_epi16(coeff1);
+- coeff02 = _mm256_cvtepi8_epi16(coeff2);
+- coeff03 = _mm256_cvtepi8_epi16(coeff3);
++ mCoef0 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_y)[0]));
++ mCoef1 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_y)[1]));
++ mCoef2 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_y)[2]));
++ mCoef3 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_y)[3]));
+
+- while (height--) {
+- const pel *p = (pel*)tmp;
+- for (i = 0; i < width; i += 16) {
+- T0 = _mm256_load_si256((__m256i*)(p));
+- T1 = _mm256_load_si256((__m256i*)(p + i_tmp));
+- T2 = _mm256_load_si256((__m256i*)(p + 2 * i_tmp));
+- T3 = _mm256_load_si256((__m256i*)(p + 3 * i_tmp));
+- T4 = _mm256_load_si256((__m256i*)(p + 4 * i_tmp));
+- T5 = _mm256_load_si256((__m256i*)(p + 5 * i_tmp));
+- T6 = _mm256_load_si256((__m256i*)(p + 6 * i_tmp));
+- T7 = _mm256_load_si256((__m256i*)(p + 7 * i_tmp));
++ while (height > 0) {
++ T0 = _mm256_load_si256((__m256i*)(tmp));
++ T1 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp));
++ T2 = _mm256_load_si256((__m256i*)(tmp + 2 * i_tmp));
++ T3 = _mm256_loadu_si256((__m256i*)(tmp + 3 * i_tmp));
++ T4 = _mm256_load_si256((__m256i*)(tmp + 4 * i_tmp));
++ T5 = _mm256_loadu_si256((__m256i*)(tmp + 5 * i_tmp));
++ T6 = _mm256_load_si256((__m256i*)(tmp + 6 * i_tmp));
++ T7 = _mm256_loadu_si256((__m256i*)(tmp + 7 * i_tmp));
++ height -= 2;
++ tmp += i_tmp * 2;
+
+ M0 = _mm256_unpacklo_epi16(T0, T1);
+ M1 = _mm256_unpacklo_epi16(T2, T3);
+@@ -3935,14 +4371,14 @@ void uavs3d_if_hor_ver_luma_w16x_avx2(const pel *src, int i_src, pel *dst, int i
+ M6 = _mm256_unpackhi_epi16(T4, T5);
+ M7 = _mm256_unpackhi_epi16(T6, T7);
+
+- N0 = _mm256_madd_epi16(M0, coeff00);
+- N1 = _mm256_madd_epi16(M1, coeff01);
+- N2 = _mm256_madd_epi16(M2, coeff02);
+- N3 = _mm256_madd_epi16(M3, coeff03);
+- N4 = _mm256_madd_epi16(M4, coeff00);
+- N5 = _mm256_madd_epi16(M5, coeff01);
+- N6 = _mm256_madd_epi16(M6, coeff02);
+- N7 = _mm256_madd_epi16(M7, coeff03);
++ N0 = _mm256_madd_epi16(M0, mCoef0);
++ N1 = _mm256_madd_epi16(M1, mCoef1);
++ N2 = _mm256_madd_epi16(M2, mCoef2);
++ N3 = _mm256_madd_epi16(M3, mCoef3);
++ N4 = _mm256_madd_epi16(M4, mCoef0);
++ N5 = _mm256_madd_epi16(M5, mCoef1);
++ N6 = _mm256_madd_epi16(M6, mCoef2);
++ N7 = _mm256_madd_epi16(M7, mCoef3);
+
+ N0 = _mm256_add_epi32(N0, N1);
+ N1 = _mm256_add_epi32(N2, N3);
+@@ -3958,14 +4394,164 @@ void uavs3d_if_hor_ver_luma_w16x_avx2(const pel *src, int i_src, pel *dst, int i
+ N1 = _mm256_srai_epi32(N1, shift2);
+ N0 = _mm256_packus_epi32(N0, N1);
+ N0 = _mm256_min_epu16(N0, max_pel);
+- _mm256_storeu_si256((__m256i*)(dst + i), N0);
+
+- p += 16;
++ _mm_storeu_si128((__m128i*)(dst), _mm256_castsi256_si128(N0));
++ _mm_storeu_si128((__m128i*)(dst + i_dst), _mm256_extracti128_si256(N0, 1));
++
++ dst += i_dst << 1;
+ }
+- dst += i_dst;
+- tmp += i_tmp;
++ }
++}
++
++void uavs3d_if_hor_ver_luma_w16x_avx2(const pel *src, int i_src, pel *dst, int i_dst, int width, int height, const s8 *coef_x, const s8 *coef_y, int max_val) // Two-pass 8-tap filter: horizontal pass (coef_x) into tmp_res, then vertical pass (coef_y) into dst; columns are processed 16 at a time.
++{
++ ALIGNED_32(s16 tmp_res[(128 + 7) * 128]); // intermediate rows from the horizontal pass; presumably sized for width, height <= 128 -- TODO confirm against callers
++ s16 *tmp = tmp_res;
++ int row, i;; // NOTE(review): the second ';' is a stray empty statement (harmless, kept as-is)
++ int add1, shift1;
++ int add2, shift2;
++ __m256i offset;
++ __m256i T0, T1, T2, T3, T4, T5;
++ __m256i M0, M1, M2, M3, M4, M5, M6, M7;
++ int i_tmp = width; // tmp_res row stride, in s16 elements
++ __m256i mCoef0, mCoef1, mCoef2, mCoef3;
++ __m256i max_pel = _mm256_set1_epi16((pel)max_val); // upper clamp for the final output
++
++ if (max_val == 255) { // 8 bit_depth
++ shift1 = 0; // no down-shift after the horizontal pass
++ shift2 = 12;
++ }
++ else { // 10 bit_depth
++ shift1 = 2;
++ shift2 = 10; // shift1 + shift2 is 12 in both branches
+ }
+
++ add1 = (1 << (shift1)) >> 1; // rounding term for the first pass: 0 when shift1 == 0, 2 when shift1 == 2
++ add2 = 1 << (shift2 - 1); // rounding term for the second pass
++
++ src += -3 * i_src - 3; // start three rows above and three samples to the left
++
++ {
++ __m256i S0, S1, S2;
++ __m256i mShuffle0 = _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9); // per lane: adjacent 16-bit sample pairs (0,1) (1,2) (2,3) (3,4)
++ __m256i mShuffle1 = _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13); // per lane: pairs (2,3) (3,4) (4,5) (5,6)
++
++ mCoef0 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_x)[0])); // horizontal taps 0,1: byte pair broadcast, then sign-extended to 16 bits
++ mCoef1 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_x)[1])); // taps 2,3
++ mCoef2 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_x)[2])); // taps 4,5
++ mCoef3 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_x)[3])); // taps 6,7
++ offset = _mm256_set1_epi32(add1);
++
++ row = height + 7; // the vertical pass reads 8 tmp rows per output row, so 7 extra rows are filtered
++
++ while (row--) {
++ const pel *p = src;
++ uavs3d_prefetch(src + i_src, _MM_HINT_NTA); // hint for the next source row
++ for (i = 0; i < width; i += 16) { // 16 output columns per pass
++ S0 = _mm256_loadu_si256((__m256i*)(p)); // three overlapping 256-bit windows starting at samples +0, +4 and +8
++ S1 = _mm256_loadu_si256((__m256i*)(p + 4));
++ S2 = _mm256_loadu_si256((__m256i*)(p + 8));
++
++ T0 = _mm256_shuffle_epi8(S0, mShuffle0); // sample pairs lined up with tap pairs (0,1) / (2,3) for in-lane outputs 0..3
++ T1 = _mm256_shuffle_epi8(S0, mShuffle1);
++ T2 = _mm256_shuffle_epi8(S1, mShuffle0); // used twice: taps (4,5)/(6,7) of outputs 0..3 and taps (0,1)/(2,3) of outputs 4..7
++ T3 = _mm256_shuffle_epi8(S1, mShuffle1);
++ T4 = _mm256_shuffle_epi8(S2, mShuffle0); // taps (4,5) / (6,7) for in-lane outputs 4..7
++ T5 = _mm256_shuffle_epi8(S2, mShuffle1);
++
++ M0 = _mm256_madd_epi16(T0, mCoef0); // M0..M3: partial sums for in-lane outputs 0..3
++ M1 = _mm256_madd_epi16(T1, mCoef1);
++ M2 = _mm256_madd_epi16(T2, mCoef2);
++ M3 = _mm256_madd_epi16(T3, mCoef3);
++ M4 = _mm256_madd_epi16(T2, mCoef0); // M4..M7: partial sums for in-lane outputs 4..7
++ M5 = _mm256_madd_epi16(T3, mCoef1);
++ M6 = _mm256_madd_epi16(T4, mCoef2);
++ M7 = _mm256_madd_epi16(T5, mCoef3);
++
++ M0 = _mm256_add_epi32(M0, M1);
++ M1 = _mm256_add_epi32(M2, M3);
++ M2 = _mm256_add_epi32(M4, M5);
++ M3 = _mm256_add_epi32(M6, M7);
++
++ M0 = _mm256_add_epi32(M0, M1); // complete 8-tap sums, in-lane outputs 0..3
++ M1 = _mm256_add_epi32(M2, M3); // complete 8-tap sums, in-lane outputs 4..7
++
++ p += 16;
++ M2 = _mm256_add_epi32(M0, offset); // round and shift by shift1
++ M3 = _mm256_add_epi32(M1, offset);
++ M2 = _mm256_srai_epi32(M2, shift1);
++ M3 = _mm256_srai_epi32(M3, shift1);
++ M2 = _mm256_packs_epi32(M2, M3); // signed-saturating pack to 16 bits; per-lane packing leaves the 16 outputs in column order
++ _mm256_storeu_si256((__m256i*)(tmp + i), M2);
++ }
++ tmp += i_tmp;
++ src += i_src;
++ }
++ }
++
++ {
++ __m256i N0, N1, N2, N3, N4, N5, N6, N7;
++ __m256i T6, T7;
++ offset = _mm256_set1_epi32(add2);
++ tmp = tmp_res; // rewind to the first intermediate row
++
++ mCoef0 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_y)[0])); // vertical taps 0,1
++ mCoef1 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_y)[1])); // taps 2,3
++ mCoef2 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_y)[2])); // taps 4,5
++ mCoef3 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_y)[3])); // taps 6,7
++
++ while (height--) { // one output row per pass
++ const pel *p = (pel*)tmp; // NOTE(review): walks the s16 buffer through a pel pointer; assumes sizeof(pel) == sizeof(s16) -- confirm
++ for (i = 0; i < width; i += 16) {
++ T0 = _mm256_load_si256((__m256i*)(p)); // aligned loads of 8 consecutive tmp rows; assumes width is a multiple of 16 so every row start stays 32-byte aligned -- TODO confirm
++ T1 = _mm256_load_si256((__m256i*)(p + i_tmp));
++ T2 = _mm256_load_si256((__m256i*)(p + 2 * i_tmp));
++ T3 = _mm256_load_si256((__m256i*)(p + 3 * i_tmp));
++ T4 = _mm256_load_si256((__m256i*)(p + 4 * i_tmp));
++ T5 = _mm256_load_si256((__m256i*)(p + 5 * i_tmp));
++ T6 = _mm256_load_si256((__m256i*)(p + 6 * i_tmp));
++ T7 = _mm256_load_si256((__m256i*)(p + 7 * i_tmp));
++
++ M0 = _mm256_unpacklo_epi16(T0, T1); // interleave vertically adjacent rows so madd applies one tap pair per 32-bit element
++ M1 = _mm256_unpacklo_epi16(T2, T3);
++ M2 = _mm256_unpacklo_epi16(T4, T5);
++ M3 = _mm256_unpacklo_epi16(T6, T7);
++ M4 = _mm256_unpackhi_epi16(T0, T1);
++ M5 = _mm256_unpackhi_epi16(T2, T3);
++ M6 = _mm256_unpackhi_epi16(T4, T5);
++ M7 = _mm256_unpackhi_epi16(T6, T7);
++
++ N0 = _mm256_madd_epi16(M0, mCoef0);
++ N1 = _mm256_madd_epi16(M1, mCoef1);
++ N2 = _mm256_madd_epi16(M2, mCoef2);
++ N3 = _mm256_madd_epi16(M3, mCoef3);
++ N4 = _mm256_madd_epi16(M4, mCoef0);
++ N5 = _mm256_madd_epi16(M5, mCoef1);
++ N6 = _mm256_madd_epi16(M6, mCoef2);
++ N7 = _mm256_madd_epi16(M7, mCoef3);
++
++ N0 = _mm256_add_epi32(N0, N1);
++ N1 = _mm256_add_epi32(N2, N3);
++ N2 = _mm256_add_epi32(N4, N5);
++ N3 = _mm256_add_epi32(N6, N7);
++
++ N0 = _mm256_add_epi32(N0, N1); // 8-tap sums for the "lo" halves of both lanes
++ N1 = _mm256_add_epi32(N2, N3); // 8-tap sums for the "hi" halves of both lanes
++
++ N0 = _mm256_add_epi32(N0, offset); // round and shift by shift2
++ N1 = _mm256_add_epi32(N1, offset);
++ N0 = _mm256_srai_epi32(N0, shift2);
++ N1 = _mm256_srai_epi32(N1, shift2);
++ N0 = _mm256_packus_epi32(N0, N1); // unsigned-saturating pack: negative sums become 0, column order is restored
++ N0 = _mm256_min_epu16(N0, max_pel); // clamp to max_val
++ _mm256_storeu_si256((__m256i*)(dst + i), N0);
++
++ p += 16;
++ }
++ dst += i_dst;
++ tmp += i_tmp;
++ }
++ }
+ }
+
+ void uavs3d_if_hor_ver_chroma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst, int width, int height, const s8 *coef_x, const s8 *coef_y, int max_val)
+@@ -3979,14 +4565,6 @@ void uavs3d_if_hor_ver_chroma_w8_avx2(const pel *src, int i_src, pel *dst, int i
+ int shift1, shift2;
+ int add1, add2;
+
+- __m128i coef0 = _mm_set1_epi16(*(s16*)coef_x);
+- __m128i coef1 = _mm_set1_epi16(*(s16*)(coef_x + 2));
+- __m256i mCoef0 = _mm256_cvtepi8_epi16(coef0);
+- __m256i mCoef1 = _mm256_cvtepi8_epi16(coef1);
+- __m256i mSwitch = _mm256_setr_epi8(0, 1, 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 0, 1, 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11);
+- __m256i T0, T1, S0, S1, sum;
+- __m256i mAddOffset;
+-
+ if (max_val == 255) { // 8 bit_depth
+ shift1 = 0;
+ shift2 = 12;
+@@ -3999,25 +4577,34 @@ void uavs3d_if_hor_ver_chroma_w8_avx2(const pel *src, int i_src, pel *dst, int i
+ add1 = (1 << (shift1)) >> 1;
+ add2 = 1 << (shift2 - 1);
+
+- mAddOffset = _mm256_set1_epi32(add1);
+ //HOR
++ __m128i coef0 = _mm_cvtepi8_epi16(_mm_set1_epi16(((s16*)coef_x)[0]));
++ __m128i coef1 = _mm_cvtepi8_epi16(_mm_set1_epi16(((s16*)coef_x)[1]));
++ __m256i mCoef0 = _mm256_set_m128i(coef1, coef0);
++ __m256i mCoef1 = _mm256_set_m128i(coef0, coef1);
++ __m256i mSwitch = _mm256_setr_epi8(0, 1, 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 0, 1, 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11);
++ __m256i T0, T1, S0, S1, sum;
++ __m256i mAddOffset = _mm256_set1_epi32(add1);
++ __m128i mDst;
++ __m128i s0;
++
+ src = src - i_src - 2;
+ row = height + 3;
+ while (row--) {
+- uavs3d_prefetch(src + i_src, _MM_HINT_NTA);
+ S0 = _mm256_loadu_si256((__m256i*)(src));
+- S1 = _mm256_loadu_si256((__m256i*)(src + 4));
+- S0 = _mm256_permute4x64_epi64(S0, 0x94);
+- S1 = _mm256_permute4x64_epi64(S1, 0x94);
+- T0 = _mm256_madd_epi16(_mm256_shuffle_epi8(S0, mSwitch), mCoef0);
+- T1 = _mm256_madd_epi16(_mm256_shuffle_epi8(S1, mSwitch), mCoef1);
++ s0 = _mm_loadu_si128((__m128i*)(src + 4));
++ uavs3d_prefetch(src + i_src, _MM_HINT_NTA);
++ S1 = _mm256_set_m128i(s0, s0);
++ T0 = _mm256_shuffle_epi8(S0, mSwitch);
++ T1 = _mm256_shuffle_epi8(S1, mSwitch);
++ T0 = _mm256_madd_epi16(T0, mCoef0);
++ T1 = _mm256_madd_epi16(T1, mCoef1);
+ sum = _mm256_add_epi32(T0, T1);
+
+ sum = _mm256_add_epi32(sum, mAddOffset);
+ sum = _mm256_srai_epi32(sum, shift1);
+- sum = _mm256_packs_epi32(sum, sum);
+- sum = _mm256_permute4x64_epi64(sum, 0xd8);
+- _mm_storeu_si128((__m128i*)(tmp), _mm256_castsi256_si128(sum));
++ mDst = _mm_packs_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
++ _mm_store_si128((__m128i*)(tmp), mDst);
+
+ src += i_src;
+ tmp += i_tmp;
+@@ -4027,46 +4614,49 @@ void uavs3d_if_hor_ver_chroma_w8_avx2(const pel *src, int i_src, pel *dst, int i
+ tmp = tmp_res;
+
+ {
+- __m128i max_val1 = _mm_set1_epi16((pel)max_val);
+- __m128i coeff0, coeff1, mVal, mAddOffset2;
+- __m128i M0, M1, M2, M3;
++ __m128i coeff0, coeff1;
++ __m256i MaxVal = _mm256_set1_epi16((pel)max_val);
++ __m256i C0, C1, mVal, mAddOffset2;
++ __m256i M0, M1, M2, M3;
+
+ coeff0 = _mm_set1_epi16(*(s16*)coef_y);
+ coeff1 = _mm_set1_epi16(*(s16*)(coef_y + 2));
+- mAddOffset2 = _mm_set1_epi32(add2);
+-
+- coeff0 = _mm_cvtepi8_epi16(coeff0);
+- coeff1 = _mm_cvtepi8_epi16(coeff1);
+- while (height--) {
+- __m128i T00 = _mm_load_si128((__m128i*)(tmp));
+- __m128i T10 = _mm_load_si128((__m128i*)(tmp + i_tmp));
+- __m128i T20 = _mm_load_si128((__m128i*)(tmp + i_tmp2));
+- __m128i T30 = _mm_load_si128((__m128i*)(tmp + i_tmp3));
++ mAddOffset2 = _mm256_set1_epi32(add2);
+
+- M0 = _mm_unpacklo_epi16(T00, T10);
+- M1 = _mm_unpacklo_epi16(T20, T30);
+- M2 = _mm_unpackhi_epi16(T00, T10);
+- M3 = _mm_unpackhi_epi16(T20, T30);
+-
+- M0 = _mm_madd_epi16(M0, coeff0);
+- M1 = _mm_madd_epi16(M1, coeff1);
+- M2 = _mm_madd_epi16(M2, coeff0);
+- M3 = _mm_madd_epi16(M3, coeff1);
+-
+- M0 = _mm_add_epi32(M0, M1);
+- M2 = _mm_add_epi32(M2, M3);
+-
+- M0 = _mm_add_epi32(M0, mAddOffset2);
+- M2 = _mm_add_epi32(M2, mAddOffset2);
+- M0 = _mm_srai_epi32(M0, shift2);
+- M2 = _mm_srai_epi32(M2, shift2);
+-
+- mVal = _mm_packus_epi32(M0, M2);
+- mVal = _mm_min_epu16(mVal, max_val1);
+- _mm_storeu_si128((__m128i*)dst, mVal);
+-
+- tmp += i_tmp;
+- dst += i_dst;
++ C0 = _mm256_cvtepi8_epi16(coeff0);
++ C1 = _mm256_cvtepi8_epi16(coeff1);
++ while (height) {
++ __m256i T00 = _mm256_load_si256((__m256i*)(tmp));
++ __m256i T10 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp));
++ __m256i T20 = _mm256_load_si256((__m256i*)(tmp + i_tmp2));
++ __m256i T30 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp3));
++
++ M0 = _mm256_unpacklo_epi16(T00, T10);
++ M1 = _mm256_unpacklo_epi16(T20, T30);
++ M2 = _mm256_unpackhi_epi16(T00, T10);
++ M3 = _mm256_unpackhi_epi16(T20, T30);
++
++ M0 = _mm256_madd_epi16(M0, C0);
++ M1 = _mm256_madd_epi16(M1, C1);
++ M2 = _mm256_madd_epi16(M2, C0);
++ M3 = _mm256_madd_epi16(M3, C1);
++
++ M0 = _mm256_add_epi32(M0, M1);
++ M2 = _mm256_add_epi32(M2, M3);
++
++ M0 = _mm256_add_epi32(M0, mAddOffset2);
++ M2 = _mm256_add_epi32(M2, mAddOffset2);
++ M0 = _mm256_srai_epi32(M0, shift2);
++ M2 = _mm256_srai_epi32(M2, shift2);
++
++ mVal = _mm256_packus_epi32(M0, M2);
++ mVal = _mm256_min_epu16(mVal, MaxVal);
++ _mm_storeu_si128((__m128i*)dst, _mm256_castsi256_si128(mVal));
++ _mm_storeu_si128((__m128i*)(dst + i_dst), _mm256_extracti128_si256(mVal, 1));
++
++ height -= 2;
++ tmp += i_tmp2;
++ dst += i_dst << 1;
+ }
+ }
+ }
+@@ -4113,28 +4703,26 @@ void uavs3d_if_hor_ver_chroma_w16x_avx2(const pel *src, int i_src, pel *dst, int
+ while (row--) {
+ uavs3d_prefetch(src + i_src, _MM_HINT_NTA);
+ for (col = 0; col < width; col += 16) {
+- S0 = _mm256_loadu_si256((__m256i*)(src + col));
+- S1 = _mm256_loadu_si256((__m256i*)(src + col + 4));
++ S0 = _mm256_loadu_si256((__m256i*)(src + col));
++ S1 = _mm256_loadu_si256((__m256i*)(src + col + 4));
+ S2 = _mm256_loadu_si256((__m256i*)(src + col + 8));
+- S3 = _mm256_loadu_si256((__m256i*)(src + col + 12));
+- S0 = _mm256_permute4x64_epi64(S0, 0x94);
+- S1 = _mm256_permute4x64_epi64(S1, 0x94);
+- S2 = _mm256_permute4x64_epi64(S2, 0x94);
+- S3 = _mm256_permute4x64_epi64(S3, 0x94);
+- T0 = _mm256_madd_epi16(_mm256_shuffle_epi8(S0, mSwitch), mCoef0);
+- T1 = _mm256_madd_epi16(_mm256_shuffle_epi8(S1, mSwitch), mCoef1);
+- T2 = _mm256_madd_epi16(_mm256_shuffle_epi8(S2, mSwitch), mCoef0);
+- T3 = _mm256_madd_epi16(_mm256_shuffle_epi8(S3, mSwitch), mCoef1);
+- T0 = _mm256_add_epi32(T0, T1);
+- T2 = _mm256_add_epi32(T2, T3);
++ T0 = _mm256_shuffle_epi8(S0, mSwitch);
++ T1 = _mm256_shuffle_epi8(S1, mSwitch);
++ T2 = _mm256_shuffle_epi8(S1, mSwitch);
++ T3 = _mm256_shuffle_epi8(S2, mSwitch);
++ S0 = _mm256_madd_epi16(T0, mCoef0);
++ S1 = _mm256_madd_epi16(T1, mCoef1);
++ S2 = _mm256_madd_epi16(T2, mCoef0);
++ S3 = _mm256_madd_epi16(T3, mCoef1);
++ T0 = _mm256_add_epi32(S0, S1);
++ T2 = _mm256_add_epi32(S2, S3);
+
+ T0 = _mm256_add_epi32(T0, mAddOffset);
+ T2 = _mm256_add_epi32(T2, mAddOffset);
+ T0 = _mm256_srai_epi32(T0, shift1);
+ T2 = _mm256_srai_epi32(T2, shift1);
+ T0 = _mm256_packs_epi32(T0, T2);
+- T0 = _mm256_permute4x64_epi64(T0, 0xd8);
+- _mm256_storeu_si256((__m256i*)(tmp + col), T0);
++ _mm256_store_si256((__m256i*)(tmp + col), T0);
+ }
+ src += i_src;
+ tmp += i_tmp;
+diff --git a/source/decore/avx2/intra_pred_avx2.c b/source/decore/avx2/intra_pred_avx2.c
+index 07e19b2..18961f6 100644
+--- a/source/decore/avx2/intra_pred_avx2.c
++++ b/source/decore/avx2/intra_pred_avx2.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -204,7 +199,7 @@ void uavs3d_ipred_hor_avx2(pel *src, pel *dst, int i_dst, int width, int height)
+
+ void uavs3d_ipred_dc_avx2(pel *src, pel *dst, int i_dst, int width, int height, u16 avail_cu, int bit_depth)
+ {
+- int i, x, y;
++ int x, y;
+ int dc;
+ pel *p_src = src - 1;
+ int left_avail = IS_AVAIL(avail_cu, AVAIL_LE);
+@@ -212,6 +207,7 @@ void uavs3d_ipred_dc_avx2(pel *src, pel *dst, int i_dst, int width, int height,
+
+ if (left_avail && above_avail) {
+ int length = width + height + 1;
++ int i;
+ __m128i sum = _mm_setzero_si128();
+ __m128i val;
+
+@@ -2738,7 +2734,6 @@ void uavs3d_ipred_ang_xy_18_avx2(pel *src, pel *dst, int i_dst, int mode, int wi
+ dst += i_dst;
+ }
+ break;
+- break;
+ }
+
+ }
+@@ -3452,10 +3447,10 @@ void uavs3d_ipred_ver_avx2(pel *src, pel *dst, int i_dst, int width, int height)
+ T0 = _mm256_loadu_si256((__m256i *)(src));
+ T1 = _mm256_loadu_si256((__m256i *)(src + 16));
+ for (y = 0; y < height; y += 2) {
+- _mm256_store_si256((__m256i *)(dst), T0);
+- _mm256_store_si256((__m256i *)(dst + 16), T1);
+- _mm256_store_si256((__m256i *)(dst + i_dst), T0);
+- _mm256_store_si256((__m256i *)(dst + i_dst + 16), T1);
++ _mm256_storeu_si256((__m256i *)(dst), T0);
++ _mm256_storeu_si256((__m256i *)(dst + 16), T1);
++ _mm256_storeu_si256((__m256i *)(dst + i_dst), T0);
++ _mm256_storeu_si256((__m256i *)(dst + i_dst + 16), T1);
+ dst += i_dst2;
+ }
+ break;
+@@ -3468,14 +3463,14 @@ void uavs3d_ipred_ver_avx2(pel *src, pel *dst, int i_dst, int width, int height)
+ T2 = _mm256_loadu_si256((__m256i *)(src + 32));
+ T3 = _mm256_loadu_si256((__m256i *)(src + 48));
+ for (y = 0; y < height; y += 2) {
+- _mm256_store_si256((__m256i *)(dst), T0);
+- _mm256_store_si256((__m256i *)(dst + 16), T1);
+- _mm256_store_si256((__m256i *)(dst + 32), T2);
+- _mm256_store_si256((__m256i *)(dst + 48), T3);
+- _mm256_store_si256((__m256i *)(dst + i_dst), T0);
+- _mm256_store_si256((__m256i *)(dst + i_dst + 16), T1);
+- _mm256_store_si256((__m256i *)(dst + i_dst + 32), T2);
+- _mm256_store_si256((__m256i *)(dst + i_dst + 48), T3);
++ _mm256_storeu_si256((__m256i *)(dst), T0);
++ _mm256_storeu_si256((__m256i *)(dst + 16), T1);
++ _mm256_storeu_si256((__m256i *)(dst + 32), T2);
++ _mm256_storeu_si256((__m256i *)(dst + 48), T3);
++ _mm256_storeu_si256((__m256i *)(dst + i_dst), T0);
++ _mm256_storeu_si256((__m256i *)(dst + i_dst + 16), T1);
++ _mm256_storeu_si256((__m256i *)(dst + i_dst + 32), T2);
++ _mm256_storeu_si256((__m256i *)(dst + i_dst + 48), T3);
+ dst += i_dst2;
+ }
+ break;
+@@ -3539,10 +3534,10 @@ void uavs3d_ipred_hor_avx2(pel *src, pel *dst, int i_dst, int width, int height)
+ for (y = 0; y < height; y += 2) {
+ T0 = _mm256_set1_epi16(src[-y]);
+ T1 = _mm256_set1_epi16(src[-y - 1]);
+- _mm256_store_si256((__m256i *)(dst), T0);
+- _mm256_store_si256((__m256i *)(dst + 16), T0);
+- _mm256_store_si256((__m256i *)(dst + i_dst), T1);
+- _mm256_store_si256((__m256i *)(dst + i_dst + 16), T1);
++ _mm256_storeu_si256((__m256i *)(dst), T0);
++ _mm256_storeu_si256((__m256i *)(dst + 16), T0);
++ _mm256_storeu_si256((__m256i *)(dst + i_dst), T1);
++ _mm256_storeu_si256((__m256i *)(dst + i_dst + 16), T1);
+ dst += i_dst2;
+ }
+ break;
+@@ -3553,14 +3548,14 @@ void uavs3d_ipred_hor_avx2(pel *src, pel *dst, int i_dst, int width, int height)
+ for (y = 0; y < height; y += 2) {
+ T0 = _mm256_set1_epi16(src[-y]);
+ T1 = _mm256_set1_epi16(src[-y - 1]);
+- _mm256_store_si256((__m256i *)(dst), T0);
+- _mm256_store_si256((__m256i *)(dst + 16), T0);
+- _mm256_store_si256((__m256i *)(dst + 32), T0);
+- _mm256_store_si256((__m256i *)(dst + 48), T0);
+- _mm256_store_si256((__m256i *)(dst + i_dst), T1);
+- _mm256_store_si256((__m256i *)(dst + i_dst + 16), T1);
+- _mm256_store_si256((__m256i *)(dst + i_dst + 32), T1);
+- _mm256_store_si256((__m256i *)(dst + i_dst + 48), T1);
++ _mm256_storeu_si256((__m256i *)(dst), T0);
++ _mm256_storeu_si256((__m256i *)(dst + 16), T0);
++ _mm256_storeu_si256((__m256i *)(dst + 32), T0);
++ _mm256_storeu_si256((__m256i *)(dst + 48), T0);
++ _mm256_storeu_si256((__m256i *)(dst + i_dst), T1);
++ _mm256_storeu_si256((__m256i *)(dst + i_dst + 16), T1);
++ _mm256_storeu_si256((__m256i *)(dst + i_dst + 32), T1);
++ _mm256_storeu_si256((__m256i *)(dst + i_dst + 48), T1);
+ dst += i_dst2;
+ }
+ break;
+diff --git a/source/decore/avx2/itrans_avx2.c b/source/decore/avx2/itrans_avx2.c
+index 9b1df21..d8766b6 100644
+--- a/source/decore/avx2/itrans_avx2.c
++++ b/source/decore/avx2/itrans_avx2.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -79,14 +74,14 @@
+ #define TRANSPOSE_16x8_32BIT_16BIT(I00, I01, I02, I03, I04, I05, I06, I07, I08, I09, I10, I11, I12, I13, I14, I15, O0, O1, O2, O3, O4, O5, O6, O7)\
+ TRANSPOSE_8x8_32BIT_16BIT(I00, I01, I02, I03, I04, I05, I06, I07, I04, I05, I06, I07); \
+ TRANSPOSE_8x8_32BIT_16BIT(I08, I09, I10, I11, I12, I13, I14, I15, I12, I13, I14, I15); \
+- O0 = _mm256_insertf128_si256(I04, _mm256_castsi256_si128(I12), 1); \
+- O1 = _mm256_insertf128_si256(I05, _mm256_castsi256_si128(I13), 1); \
+- O2 = _mm256_insertf128_si256(I06, _mm256_castsi256_si128(I14), 1); \
+- O3 = _mm256_insertf128_si256(I07, _mm256_castsi256_si128(I15), 1); \
+- O4 = _mm256_insertf128_si256(I12, _mm256_extracti128_si256(I04, 1), 0); \
+- O5 = _mm256_insertf128_si256(I13, _mm256_extracti128_si256(I05, 1), 0); \
+- O6 = _mm256_insertf128_si256(I14, _mm256_extracti128_si256(I06, 1), 0); \
+- O7 = _mm256_insertf128_si256(I15, _mm256_extracti128_si256(I07, 1), 0)
++ O0 = _mm256_permute2x128_si256(I04, I12, 0x20); \
++ O1 = _mm256_permute2x128_si256(I05, I13, 0x20); \
++ O2 = _mm256_permute2x128_si256(I06, I14, 0x20); \
++ O3 = _mm256_permute2x128_si256(I07, I15, 0x20); \
++ O4 = _mm256_permute2x128_si256(I04, I12, 0x31); \
++ O5 = _mm256_permute2x128_si256(I05, I13, 0x31); \
++ O6 = _mm256_permute2x128_si256(I06, I14, 0x31); \
++ O7 = _mm256_permute2x128_si256(I07, I15, 0x31)
+
+
+ static void uavs3d_always_inline dct2_butterfly_h4_avx2(s16* src, s16* dst, int line, int shift, int bit_depth)
+@@ -272,10 +267,10 @@ static void uavs3d_always_inline dct2_butterfly_h8_avx2(s16* src, int i_src, s16
+
+ // transpose 8x8 : 8 x 8(32bit) --> 4 x 16(16bit)
+ TRANSPOSE_8x8_32BIT_16BIT(d0, d1, d2, d3, d4, d5, d6, d7, d4, d5, d6, d7);
+- d0 = _mm256_insertf128_si256(d4, _mm256_castsi256_si128(d5), 1);
+- d1 = _mm256_insertf128_si256(d6, _mm256_castsi256_si128(d7), 1);
+- d2 = _mm256_insertf128_si256(d5, _mm256_extracti128_si256(d4, 1), 0);
+- d3 = _mm256_insertf128_si256(d7, _mm256_extracti128_si256(d6, 1), 0);
++ d0 = _mm256_permute2x128_si256(d4, d5, 0x20);
++ d2 = _mm256_permute2x128_si256(d4, d5, 0x31);
++ d1 = _mm256_permute2x128_si256(d6, d7, 0x20);
++ d3 = _mm256_permute2x128_si256(d6, d7, 0x31);
+
+ if (bit_depth != MAX_TX_DYNAMIC_RANGE) {
+ __m256i max_val = _mm256_set1_epi16((1 << bit_depth) - 1);
+diff --git a/source/decore/avx2/pixel_avx2.c b/source/decore/avx2/pixel_avx2.c
+index 8031fe7..10d48f9 100644
+--- a/source/decore/avx2/pixel_avx2.c
++++ b/source/decore/avx2/pixel_avx2.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -986,8 +981,8 @@ void uavs3d_recon_chroma_w16_avx2(s16 *resi_u, s16 *resi_v, pel *pred, int width
+ r1 = _mm256_loadu_si256((const __m256i*)(resi_v));
+ r2 = _mm256_unpacklo_epi16(r0, r1); // UV interlaced: uv0-uv4 uv8-uv12
+ r3 = _mm256_unpackhi_epi16(r0, r1);
+- r0 = _mm256_insertf128_si256(r2, _mm256_castsi256_si128(r3), 0x1); // uv0-uv8
+- r1 = _mm256_insertf128_si256(r3, _mm256_extracti128_si256(r2, 1), 0x0);
++ r0 = _mm256_permute2x128_si256(r2, r3, 0x20); // uv0-uv8
++ r1 = _mm256_permute2x128_si256(r2, r3, 0x31);
+ p0 = _mm256_adds_epi16(p0, r0);
+ p1 = _mm256_adds_epi16(p1, r1);
+
+@@ -1035,8 +1030,8 @@ void uavs3d_recon_chroma_w16_avx2(s16 *resi_u, s16 *resi_v, pel *pred, int width
+ r1 = _mm256_loadu_si256((const __m256i*)(resi_v));
+ r2 = _mm256_unpacklo_epi16(zero, r1); // UV interlaced: uv0-uv4 uv8-uv12
+ r3 = _mm256_unpackhi_epi16(zero, r1);
+- r0 = _mm256_insertf128_si256(r2, _mm256_castsi256_si128(r3), 0x1); // uv0-uv8
+- r1 = _mm256_insertf128_si256(r3, _mm256_extracti128_si256(r2, 1), 0x0);
++ r0 = _mm256_permute2x128_si256(r2, r3, 0x20); // uv0-uv8
++ r1 = _mm256_permute2x128_si256(r2, r3, 0x31);
+ p0 = _mm256_adds_epi16(p0, r0);
+ p1 = _mm256_adds_epi16(p1, r1);
+
+@@ -1073,8 +1068,8 @@ void uavs3d_recon_chroma_w16x_avx2(s16 *resi_u, s16 *resi_v, pel *pred, int widt
+ r1 = _mm256_loadu_si256((const __m256i*)(resi_v + j));
+ r2 = _mm256_unpacklo_epi16(r0, r1); // UV interlaced: uv0-uv4 uv8-uv12
+ r3 = _mm256_unpackhi_epi16(r0, r1);
+- r0 = _mm256_insertf128_si256(r2, _mm256_castsi256_si128(r3), 0x1); // uv0-uv8
+- r1 = _mm256_insertf128_si256(r3, _mm256_extracti128_si256(r2, 1), 0x0);
++ r0 = _mm256_permute2x128_si256(r2, r3, 0x20); // uv0-uv8
++ r1 = _mm256_permute2x128_si256(r2, r3, 0x31);
+ p0 = _mm256_adds_epi16(p0, r0);
+ p1 = _mm256_adds_epi16(p1, r1);
+
+@@ -1126,8 +1121,8 @@ void uavs3d_recon_chroma_w16x_avx2(s16 *resi_u, s16 *resi_v, pel *pred, int widt
+ r1 = _mm256_loadu_si256((const __m256i*)(resi_v + j));
+ r2 = _mm256_unpacklo_epi16(zero, r1); // UV interlaced: uv0-uv4 uv8-uv12
+ r3 = _mm256_unpackhi_epi16(zero, r1);
+- r0 = _mm256_insertf128_si256(r2, _mm256_castsi256_si128(r3), 0x1); // uv0-uv8
+- r1 = _mm256_insertf128_si256(r3, _mm256_extracti128_si256(r2, 1), 0x0);
++ r0 = _mm256_permute2x128_si256(r2, r3, 0x20); // uv0-uv8
++ r1 = _mm256_permute2x128_si256(r2, r3, 0x31);
+ p0 = _mm256_adds_epi16(p0, r0);
+ p1 = _mm256_adds_epi16(p1, r1);
+
+diff --git a/source/decore/avx2/sao_avx2.c b/source/decore/avx2/sao_avx2.c
+index 73c01b4..7d2d527 100644
+--- a/source/decore/avx2/sao_avx2.c
++++ b/source/decore/avx2/sao_avx2.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/com_def.h b/source/decore/com_def.h
+index a8e9446..8b7ad27 100644
+--- a/source/decore/com_def.h
++++ b/source/decore/com_def.h
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -40,14 +35,18 @@
+
+ #include "com_sys.h"
+
+-#define BIT_DEPTH 8
+-
+ #define CHECK_RAND_STRM 0
+
+-#if (BIT_DEPTH == 8)
+-typedef u8 pel; /* pixel type */
++#ifndef COMPILE_10BIT
++#define COMPILE_10BIT 0
++#endif
++
++#if COMPILE_10BIT
++typedef unsigned short pel; /* pixel type */
++#define BIT_DEPTH 10
+ #else
+-typedef s16 pel; /* pixel type */
++typedef unsigned char pel; /* pixel type */
++#define BIT_DEPTH 8
+ #endif
+
+ /************************* profile & level **********************************************/
+diff --git a/source/decore/com_sys.h b/source/decore/com_sys.h
+index 0cb4359..2ea3237 100644
+--- a/source/decore/com_sys.h
++++ b/source/decore/com_sys.h
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/com_table.c b/source/decore/com_table.c
+index bbc40d9..2042e6d 100644
+--- a/source/decore/com_table.c
++++ b/source/decore/com_table.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/com_table.h b/source/decore/com_table.h
+index 2c2bb3d..c419405 100644
+--- a/source/decore/com_table.h
++++ b/source/decore/com_table.h
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/com_type.h b/source/decore/com_type.h
+index 5bb8337..0a7db50 100644
+--- a/source/decore/com_type.h
++++ b/source/decore/com_type.h
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/com_util.c b/source/decore/com_util.c
+index 7e374c9..353804b 100644
+--- a/source/decore/com_util.c
++++ b/source/decore/com_util.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/com_util.h b/source/decore/com_util.h
+index b6887bb..721a0c1 100644
+--- a/source/decore/com_util.h
++++ b/source/decore/com_util.h
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -41,10 +36,10 @@
+ #include "com_type.h"
+
+ /* function selection define based on platforms */
+-#if (defined(__ANDROID__) && defined(__aarch64__)) || (defined(__APPLE__) && defined(__arm64__))
++#if defined(_arm64) || (defined(__APPLE__) && defined(__arm64__))
+ #define ENABLE_FUNCTION_C 1
+ #define ENABLE_FUNCTION_ARM64 1
+-#elif (defined(__ANDROID__) && defined(__arm__)) || (defined(__APPLE__) && defined(__ARM_NEON__))
++#elif defined(_armv7a) || (defined(__APPLE__) && defined(__ARM_NEON__))
+ #define ENABLE_FUNCTION_C 1
+ #define ENABLE_FUNCTION_ARM32 1
+ #elif (defined(__WIN32__) || defined(_WIN32)) || (defined(__MACOSX__) || defined(macintosh) || defined(__linux__) || defined(__unix__)) && (defined(__i386__) || defined(__x86_64__) || defined(__AMD64__))
+diff --git a/source/decore/deblock.c b/source/decore/deblock.c
+index 0996c57..f9764d6 100644
+--- a/source/decore/deblock.c
++++ b/source/decore/deblock.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -65,7 +60,7 @@ static int uavs3d_always_inline skip_filter(com_map_t *map, com_ref_pic_t refp[M
+ {
+ com_scu_t MbQ = map->map_scu[scup + offset];
+ com_pic_t *q_pic0, *q_pic1;
+- const com_scu_t mask = {0, 0, 0, 0, 1, 0, 0};
++ const com_scu_t mask = {0, 1, 0, 0, 1, 0, 0};
+
+ if ((*(u8*)&MbQ) & (*(u8*)&mask)) {
+ return 0;
+@@ -210,7 +205,7 @@ void com_deblock_set_edge(com_core_t *core)
+ int scu_x = core->cu_pix_x >> MIN_CU_LOG2;
+ int scu_y = core->cu_pix_y >> MIN_CU_LOG2;
+ const int grad_mask = (LOOPFILTER_GRID >> 2) - 1;
+- const com_scu_t mask = { 0, 0, 0, 0, 1, 0, 0 };
++ const com_scu_t mask = { 0, 1, 0, 0, 1, 0, 0 };
+ com_scu_t scu = map->map_scu[scup];
+
+ if ((*(u8*)&scu) & (*(u8*)&mask)) {
+diff --git a/source/decore/inter_pred.c b/source/decore/inter_pred.c
+index c53d399..470c84c 100644
+--- a/source/decore/inter_pred.c
++++ b/source/decore/inter_pred.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -486,10 +481,10 @@ void uavs3d_always_inline com_affine_mc_chroma(com_core_t *core, pel *dstc, int
+ int max_posx = (seqhdr->pic_width + 4) >> 1;
+ int max_posy = (seqhdr->pic_height + 4) >> 1;
+ int i_asb_mv = cu_width >> 2;
+- s32(*asb_mv1)[MV_D] = asb_mv0 + i_asb_mv;
+ int i_src = ref_pic->stride_chroma;
+
+ if (sub_blk_size == 4) {
++ s32(*asb_mv1)[MV_D] = asb_mv0 + i_asb_mv;
+ for (h = 0; h < cu_height; h += 8) {
+ int base_y = (y + h) << 4;
+ for (w = 0; w < cu_width; w += 8, asb_mv0 += 2, asb_mv1 += 2) {
+diff --git a/source/decore/intra_pred.c b/source/decore/intra_pred.c
+index de1eef6..c767be0 100644
+--- a/source/decore/intra_pred.c
++++ b/source/decore/intra_pred.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -320,7 +315,7 @@ void ipred_plane(pel *src, pel *dst, int i_dst, int w, int h, int bit_depth)
+ int ib_shift[5] = { 7, 10, 11, 15, 19 };
+ int idx_w = g_tbl_log2[w] - 2;
+ int idx_h = g_tbl_log2[h] - 2;
+- int im_h, is_h, im_v, is_v, temp, temp2;
++ int im_h, is_h, im_v, is_v, temp;
+ int max_pel = (1 << bit_depth) - 1;
+ int val;
+
+@@ -343,7 +338,7 @@ void ipred_plane(pel *src, pel *dst, int i_dst, int w, int h, int bit_depth)
+ temp = a - (h2 - 1) * c - (w2 - 1) * b + 16;
+
+ for (y = 0; y < h; y++) {
+- temp2 = temp;
++ int temp2 = temp;
+ for (x = 0; x < w; x++) {
+ val = temp2 >> 5;
+ dst[x] = (pel)COM_CLIP3(0, max_pel, val);
+@@ -368,7 +363,7 @@ void ipred_plane_ipf(pel *src, s16 *dst, int w, int h)
+ int ib_shift[5] = { 7, 10, 11, 15, 19 };
+ int idx_w = g_tbl_log2[w] - 2;
+ int idx_h = g_tbl_log2[h] - 2;
+- int im_h, is_h, im_v, is_v, temp, temp2;
++ int im_h, is_h, im_v, is_v, temp;
+ im_h = ib_mult[idx_w];
+ is_h = ib_shift[idx_w];
+ im_v = ib_mult[idx_h];
+@@ -388,7 +383,7 @@ void ipred_plane_ipf(pel *src, s16 *dst, int w, int h)
+ temp = a - (h2 - 1) * c - (w2 - 1) * b + 16;
+
+ for (y = 0; y < h; y++) {
+- temp2 = temp;
++ int temp2 = temp;
+ for (x = 0; x < w; x++) {
+ dst[x] = (s16)(temp2 >> 5);
+ temp2 += b;
+@@ -416,7 +411,7 @@ void ipred_plane_uv(pel *src, pel *dst, int i_dst, int w, int h, int bit_depth)
+ int idx_w = g_tbl_log2[w] - 2;
+ int idx_h = g_tbl_log2[h] - 2;
+ int im_h, is_h, im_v, is_v;
+- int temp_u, temp_v, temp2_u, temp2_v;
++ int temp_u, temp_v;
+ int max_pel = (1 << bit_depth) - 1;
+ int val_u, val_v;
+
+@@ -448,8 +443,8 @@ void ipred_plane_uv(pel *src, pel *dst, int i_dst, int w, int h, int bit_depth)
+ temp_v = a_v - (h2 - 1) * c_v - (w2 - 1) * b_v + 16;
+
+ for (y = 0; y < h; y++) {
+- temp2_u = temp_u;
+- temp2_v = temp_v;
++ int temp2_u = temp_u;
++ int temp2_v = temp_v;
+ for (x = 0; x < width2; x += 2) {
+ val_u = temp2_u >> 5;
+ val_v = temp2_v >> 5;
+@@ -475,8 +470,7 @@ void ipred_bi(pel *src, pel *dst, int i_dst, int w, int h, int bit_depth)
+ int ishift = COM_MIN(ishift_x, ishift_y);
+ int ishift_xy = ishift_x + ishift_y + 1;
+ int offset = 1 << (ishift_x + ishift_y);
+- int a, b, c, wt, wxy, tmp;
+- int predx;
++ int a, b, c, wt, tmp;
+ int ref_up[MAX_CU_SIZE], ref_le[MAX_CU_SIZE], up[MAX_CU_SIZE], le[MAX_CU_SIZE], wy[MAX_CU_SIZE];
+ int wc, tbl_wc[6] = {-1, 21, 13, 7, 4, 2};
+ int max_pel = (1 << bit_depth) - 1;
+@@ -510,8 +504,8 @@ void ipred_bi(pel *src, pel *dst, int i_dst, int w, int h, int bit_depth)
+ tmp += wt;
+ }
+ for( y = 0; y < h; y++ ) {
+- predx = ref_le[y];
+- wxy = 0;
++ int predx = ref_le[y];
++ int wxy = 0;
+ for( x = 0; x < w; x++ ) {
+ predx += le[y];
+ ref_up[x] += up[x];
+@@ -534,8 +528,7 @@ void ipred_bi_ipf(pel *src, s16 *dst, int w, int h)
+ int ishift = COM_MIN(ishift_x, ishift_y);
+ int ishift_xy = ishift_x + ishift_y + 1;
+ int offset = 1 << (ishift_x + ishift_y);
+- int a, b, c, wt, wxy, tmp;
+- int predx;
++ int a, b, c, wt, tmp;
+ int ref_up[MAX_CU_SIZE], ref_le[MAX_CU_SIZE], up[MAX_CU_SIZE], le[MAX_CU_SIZE], wy[MAX_CU_SIZE];
+ int wc, tbl_wc[6] = { -1, 21, 13, 7, 4, 2 };
+ wc = ishift_x > ishift_y ? ishift_x - ishift_y : ishift_y - ishift_x;
+@@ -566,8 +559,8 @@ void ipred_bi_ipf(pel *src, s16 *dst, int w, int h)
+ tmp += wt;
+ }
+ for (y = 0; y < h; y++) {
+- predx = ref_le[y];
+- wxy = 0;
++ int predx = ref_le[y];
++ int wxy = 0;
+ for (x = 0; x < w; x++) {
+ predx += le[y];
+ ref_up[x] += up[x];
+@@ -589,9 +582,8 @@ void ipred_bi_uv(pel *src, pel *dst, int i_dst, int w, int h, int bit_depth)
+ int ishift = COM_MIN(ishift_x, ishift_y);
+ int ishift_xy = ishift_x + ishift_y + 1;
+ int offset = 1 << (ishift_x + ishift_y);
+- int a_u, b_u, c_u, wt_u, wxy_u, tmp_u;
+- int a_v, b_v, c_v, wt_v, wxy_v, tmp_v;
+- int predx_u, predx_v;
++ int a_u, b_u, c_u, wt_u, tmp_u;
++ int a_v, b_v, c_v, wt_v, tmp_v;
+ int ref_up[MAX_CU_SIZE], ref_le[MAX_CU_SIZE], up[MAX_CU_SIZE], le[MAX_CU_SIZE], wy[MAX_CU_SIZE];
+ int wc, tbl_wc[6] = { -1, 21, 13, 7, 4, 2 };
+ int w2 = w << 1;
+@@ -640,9 +632,10 @@ void ipred_bi_uv(pel *src, pel *dst, int i_dst, int w, int h, int bit_depth)
+ }
+ for (y = 0; y < h; y++) {
+ int y2 = y << 1;
+- predx_u = ref_le[y2 ];
+- predx_v = ref_le[y2 + 1];
+- wxy_u = wxy_v = 0;
++ int predx_u = ref_le[y2 ];
++ int predx_v = ref_le[y2 + 1];
++ int wxy_u = 0;
++ int wxy_v = 0;
+ for (x = 0; x < w2; x += 2) {
+ predx_u += le[y2];
+ predx_v += le[y2 + 1];
+@@ -1034,7 +1027,6 @@ static void uavs3d_always_inline ipf_core_s16(pel *src, pel *dst, int i_dst, s16
+ s32 filter_idx_ver = (s32)g_tbl_log2[h] - 2; //Block Size
+ s32 ver_filter_range = COM_MIN(h, 10);
+ s32 hor_filter_range = COM_MIN(w, 10);
+- int max_val = (1 << bit_depth) - 1;
+
+ // TODO: g_ipf_pred_param doesn't support 128
+ if (filter_idx_hor > 4) {
+@@ -1300,7 +1292,6 @@ static void xPredIntraAngAdi_X_8(pel *pSrc, pel *dst, int i_dst, int uiDirMode,
+ int line_size = iWidth + iHeight / 2 - 1;
+ int real_size = min(line_size, iWidth * 2 + 1);
+ int i;
+- int pad1, pad2;
+ int aligned_line_size = ((line_size + 15) >> 4) << 4;
+ pel *pfirst[2] = { first_line, first_line + aligned_line_size };
+
+@@ -1311,6 +1302,8 @@ static void xPredIntraAngAdi_X_8(pel *pSrc, pel *dst, int i_dst, int uiDirMode,
+
+ // padding
+ if (real_size < line_size) {
++ int pad1, pad2;
++
+ pfirst[1][real_size - 1] = pfirst[1][real_size - 2];
+
+ pad1 = pfirst[0][real_size - 1];
+@@ -1466,7 +1459,6 @@ static void xPredIntraAngAdi_Y_28(pel *pSrc, pel *dst, int i_dst, int uiDirMode,
+ int real_size = min(line_size, iHeight * 4 + 1);
+ int i;
+ int iHeight2 = iHeight << 1;
+- int pad1, pad2;
+
+ for (i = 0; i < real_size; i += 2, pSrc--) {
+ first_line[i] = (pSrc[0] + (pSrc[-1] + pSrc[-2]) * 3 + pSrc[-3] + 4) >> 3;
+@@ -1475,6 +1467,7 @@ static void xPredIntraAngAdi_Y_28(pel *pSrc, pel *dst, int i_dst, int uiDirMode,
+
+ // padding
+ if (real_size < line_size) {
++ int pad1, pad2;
+ first_line[i - 1] = first_line[i - 3];
+
+ pad1 = first_line[i - 2];
+diff --git a/source/decore/inv_trans.c b/source/decore/inv_trans.c
+index 2be533a..2d60b20 100644
+--- a/source/decore/inv_trans.c
++++ b/source/decore/inv_trans.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/modules.h b/source/decore/modules.h
+index 026d237..00d65b1 100644
+--- a/source/decore/modules.h
++++ b/source/decore/modules.h
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/pic_manager.c b/source/decore/pic_manager.c
+index 1c9be09..1a09a38 100644
+--- a/source/decore/pic_manager.c
++++ b/source/decore/pic_manager.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/recon.c b/source/decore/recon.c
+index c6466ba..951957a 100644
+--- a/source/decore/recon.c
++++ b/source/decore/recon.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/sao.c b/source/decore/sao.c
+index b39466d..9004046 100644
+--- a/source/decore/sao.c
++++ b/source/decore/sao.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/sse/alf_sse.c b/source/decore/sse/alf_sse.c
+index 2880605..24e203e 100644
+--- a/source/decore/sse/alf_sse.c
++++ b/source/decore/sse/alf_sse.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/sse/deblock_sse.c b/source/decore/sse/deblock_sse.c
+index 271c2fc..ab88636 100644
+--- a/source/decore/sse/deblock_sse.c
++++ b/source/decore/sse/deblock_sse.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/sse/inter_pred_sse.c b/source/decore/sse/inter_pred_sse.c
+index 170d079..7faf0d7 100644
+--- a/source/decore/sse/inter_pred_sse.c
++++ b/source/decore/sse/inter_pred_sse.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -55,25 +50,59 @@ void uavs3d_if_cpy_sse(const pel *src, int i_src, pel *dst, int i_dst, int width
+
+ void uavs3d_if_cpy_w4_sse(const pel *src, int i_src, pel *dst, int i_dst, int width, int height)
+ {
+- while (height) {
++ if (height < 4) {
+ CP32(dst, src);
+ CP32(dst + i_dst, src + i_src);
+- height -= 2;
+- src += i_src << 1;
+- dst += i_dst << 1;
++ }
++ else {
++ int i_src2 = i_src << 1;
++ int i_dst2 = i_dst << 1;
++ int i_src3 = i_src + i_src2;
++ int i_dst3 = i_dst + i_dst2;
++ int i_src4 = i_src << 2;
++ int i_dst4 = i_dst << 2;
++ while (height > 0) {
++ CP32(dst, src);
++ CP32(dst + i_dst, src + i_src);
++ CP32(dst + i_dst2, src + i_src2);
++ CP32(dst + i_dst3, src + i_src3);
++ height -= 4;
++ src += i_src4;
++ dst += i_dst4;
++ }
+ }
+ }
+
+ void uavs3d_if_cpy_w8_sse(const pel *src, int i_src, pel *dst, int i_dst, int width, int height)
+ {
+- int i_src2 = i_src << 1;
+- int i_dst2 = i_dst << 1;
+- while (height) {
+- CP64(dst, src);
+- CP64(dst + i_dst, src + i_src);
+- src += i_src2;
+- dst += i_dst2;
+- height -= 2;
++ if (height < 4) {
++ __m128i m0, m1;
++ m0 = _mm_loadl_epi64((const __m128i*)src);
++ m1 = _mm_loadl_epi64((const __m128i*)(src + i_src));
++ _mm_storel_epi64((__m128i*)dst, m0);
++ _mm_storel_epi64((__m128i*)(dst + i_dst), m1);
++ } else {
++ __m128i m0, m1, m2, m3;
++ int i_src2 = i_src << 1;
++ int i_dst2 = i_dst << 1;
++ int i_src3 = i_src + i_src2;
++ int i_dst3 = i_dst + i_dst2;
++ int i_src4 = i_src << 2;
++ int i_dst4 = i_dst << 2;
++ while (height) {
++ m0 = _mm_loadl_epi64((const __m128i*)src);
++ m1 = _mm_loadl_epi64((const __m128i*)(src + i_src));
++ m2 = _mm_loadl_epi64((const __m128i*)(src + i_src2));
++ m3 = _mm_loadl_epi64((const __m128i*)(src + i_src3));
++ height -= 4;
++ src += i_src4;
++
++ _mm_storel_epi64((__m128i*)dst, m0);
++ _mm_storel_epi64((__m128i*)(dst + i_dst), m1);
++ _mm_storel_epi64((__m128i*)(dst + i_dst2), m2);
++ _mm_storel_epi64((__m128i*)(dst + i_dst3), m3);
++ dst += i_dst4;
++ }
+ }
+ }
+
+@@ -371,7 +400,7 @@ void uavs3d_if_hor_luma_w8_sse(const pel *src, int i_src, pel *dst, int i_dst, i
+ height -= 2;
+
+ _mm_storel_epi64((__m128i*)dst, T0);
+- M64(dst + i_dst) = _mm_extract_epi64(T0, 1);
++ _mm_storeh_pi((__m64*)(dst + i_dst), _mm_castsi128_ps(T0));
+
+ src += i_src << 1;
+ dst += i_dst << 1;
+@@ -512,7 +541,6 @@ void uavs3d_if_ver_chroma_w16x_sse(const pel *src, int i_src, pel *dst, int i_ds
+ const int offset = 32;
+ const int shift = 6;
+ __m128i mAddOffset = _mm_set1_epi16(offset);
+- pel const *p;
+ __m128i coeff0 = _mm_set1_epi16(*(s16*)coeff);
+ __m128i coeff1 = _mm_set1_epi16(*(s16*)(coeff + 2));
+ __m128i mVal1, mVal2;
+@@ -520,7 +548,7 @@ void uavs3d_if_ver_chroma_w16x_sse(const pel *src, int i_src, pel *dst, int i_ds
+ src -= i_src;
+
+ while (height--) {
+- p = src;
++ pel const *p = src;
+ uavs3d_prefetch(src + 4 * i_src, _MM_HINT_NTA);
+ for (col = 0; col < width; col += 16) {
+ __m128i T01 = _mm_loadu_si128((__m128i*)(p));
+@@ -739,7 +767,6 @@ void uavs3d_if_ver_luma_w16x_sse(const pel *src, int i_src, pel *dst, int i_dst,
+ int col;
+ const int offset = 32;
+ const int shift = 6;
+- pel const *p;
+ __m128i mAddOffset = _mm_set1_epi16(offset);
+ __m128i coeff0 = _mm_set1_epi16(*(s16*)coeff);
+ __m128i coeff1 = _mm_set1_epi16(*(s16*)(coeff + 2));
+@@ -750,7 +777,7 @@ void uavs3d_if_ver_luma_w16x_sse(const pel *src, int i_src, pel *dst, int i_dst,
+ src -= 3 * i_src;
+
+ while (height--) {
+- p = src;
++ pel const *p = src;
+ uavs3d_prefetch(src + 8 * i_src, _MM_HINT_NTA);
+ for (col = 0; col < width; col += 16) {
+ __m128i T01 = _mm_loadu_si128((__m128i*)(p));
+@@ -811,7 +838,6 @@ void uavs3d_if_hor_ver_chroma_w8x_sse(const pel *src, int i_src, pel *dst, int i
+ {
+ int row, col;
+ int shift;
+- s16 const *p;
+ ALIGNED_16(s16 tmp_res[(64 + 3) * 64*2]);
+ s16 *tmp = tmp_res;
+ const int i_tmp = width;
+@@ -856,7 +882,7 @@ void uavs3d_if_hor_ver_chroma_w8x_sse(const pel *src, int i_src, pel *dst, int i
+ coeff1_ver = _mm_cvtepi8_epi16(coeff1_ver);
+
+ while (height--) {
+- p = tmp;
++ s16 const *p = tmp;
+ for (col = 0; col < width; col += 8) {
+ __m128i T00 = _mm_load_si128((__m128i*)(p));
+ __m128i T10 = _mm_load_si128((__m128i*)(p + i_tmp));
+@@ -1411,7 +1437,6 @@ void uavs3d_if_hor_ver_luma_w8x_sse(const pel *src, int i_src, pel *dst, int i_d
+ {
+ int row, col;
+ int shift;
+- s16 const *p;
+
+ ALIGNED_16(s16 tmp_res[(128 + 7) * 128]);
+ s16 *tmp = tmp_res;
+@@ -1473,7 +1498,7 @@ void uavs3d_if_hor_ver_luma_w8x_sse(const pel *src, int i_src, pel *dst, int i_d
+ mCoefy4_ver = _mm_cvtepi8_epi16(mCoefy4_ver);
+
+ while (height--) {
+- p = tmp;
++ s16 const *p = tmp;
+ for (col = 0; col < width; col += 8) {
+ __m128i T00 = _mm_load_si128((__m128i*)(p));
+ __m128i T10 = _mm_load_si128((__m128i*)(p + i_tmp));
+@@ -2315,14 +2340,13 @@ void uavs3d_if_hor_ver_luma_w8_sse(const pel *src, int i_src, pel *dst, int i_ds
+ int rows;
+ int add1, shift1;
+ int add2, shift2;
+- __m128i T0, T1, T2, T3, T4, T5, T6, T7;
++ __m128i T0, T1, T2, T3, T4, T5;
+ __m128i M0, M1, M2, M3, M4, M5, M6, M7;
+- __m128i N0, N1, N2, N3, N4, N5, N6, N7;
+- __m128i mCoef, offset, max_pel;
++ __m128i offset, max_pel;
+ ALIGNED_16(s16 tmp_res[(64 + 7) * 8]);
+ s16 *tmp = tmp_res;
+ const int i_tmp = 8;
+- __m128i coeff00, coeff01, coeff02, coeff03;
++ __m128i mCoef0, mCoef1, mCoef2, mCoef3;
+
+ if (max_val == 255) { // 8 bit_depth
+ shift1 = 0;
+@@ -2338,110 +2362,129 @@ void uavs3d_if_hor_ver_luma_w8_sse(const pel *src, int i_src, pel *dst, int i_ds
+
+ src += -3 * i_src - 3;
+
+- mCoef = _mm_loadl_epi64((__m128i*)coef_x);
+- offset = _mm_set1_epi32(add1);
+- mCoef = _mm_cvtepi8_epi16(mCoef);
+-
+- // HOR
+- rows = height + 7;
+- while (rows--) {
+- uavs3d_prefetch(src + i_src, _MM_HINT_NTA);
+- T0 = _mm_loadu_si128((__m128i*)(src + 0));
+- T1 = _mm_loadu_si128((__m128i*)(src + 1));
+- T2 = _mm_loadu_si128((__m128i*)(src + 2));
+- T3 = _mm_loadu_si128((__m128i*)(src + 3));
+- T4 = _mm_loadu_si128((__m128i*)(src + 4));
+- T5 = _mm_loadu_si128((__m128i*)(src + 5));
+- T6 = _mm_loadu_si128((__m128i*)(src + 6));
+- T7 = _mm_loadu_si128((__m128i*)(src + 7));
+-
+- M0 = _mm_madd_epi16(T0, mCoef);
+- M1 = _mm_madd_epi16(T1, mCoef);
+- M2 = _mm_madd_epi16(T2, mCoef);
+- M3 = _mm_madd_epi16(T3, mCoef);
+- M4 = _mm_madd_epi16(T4, mCoef);
+- M5 = _mm_madd_epi16(T5, mCoef);
+- M6 = _mm_madd_epi16(T6, mCoef);
+- M7 = _mm_madd_epi16(T7, mCoef);
++ {
++ __m128i mShuffle0 = _mm_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
++ __m128i mShuffle1 = _mm_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);
++ __m128i S0, S1, S2;
++
++ mCoef0 = _mm_set1_epi16(((s16*)coef_x)[0]);
++ mCoef1 = _mm_set1_epi16(((s16*)coef_x)[1]);
++ mCoef2 = _mm_set1_epi16(((s16*)coef_x)[2]);
++ mCoef3 = _mm_set1_epi16(((s16*)coef_x)[3]);
++ mCoef0 = _mm_cvtepi8_epi16(mCoef0);
++ mCoef1 = _mm_cvtepi8_epi16(mCoef1);
++ mCoef2 = _mm_cvtepi8_epi16(mCoef2);
++ mCoef3 = _mm_cvtepi8_epi16(mCoef3);
++ offset = _mm_set1_epi32(add1);
++
++ // HOR
++ rows = height + 7;
++ while (rows--) {
++ S0 = _mm_loadu_si128((__m128i*)(src));
++ S1 = _mm_loadu_si128((__m128i*)(src + 4));
++ S2 = _mm_loadu_si128((__m128i*)(src + 8));
++ src += i_src;
++ uavs3d_prefetch(src, _MM_HINT_NTA);
++
++ T0 = _mm_shuffle_epi8(S0, mShuffle0);
++ T1 = _mm_shuffle_epi8(S0, mShuffle1);
++ T2 = _mm_shuffle_epi8(S1, mShuffle0);
++ T3 = _mm_shuffle_epi8(S1, mShuffle1);
++ T4 = _mm_shuffle_epi8(S2, mShuffle0);
++ T5 = _mm_shuffle_epi8(S2, mShuffle1);
++
++ M0 = _mm_madd_epi16(T0, mCoef0);
++ M1 = _mm_madd_epi16(T1, mCoef1);
++ M2 = _mm_madd_epi16(T2, mCoef2);
++ M3 = _mm_madd_epi16(T3, mCoef3);
++ M4 = _mm_madd_epi16(T2, mCoef0);
++ M5 = _mm_madd_epi16(T3, mCoef1);
++ M6 = _mm_madd_epi16(T4, mCoef2);
++ M7 = _mm_madd_epi16(T5, mCoef3);
+
+- M0 = _mm_hadd_epi32(M0, M1);
+- M1 = _mm_hadd_epi32(M2, M3);
+- M2 = _mm_hadd_epi32(M4, M5);
+- M3 = _mm_hadd_epi32(M6, M7);
++ M0 = _mm_add_epi32(M0, M1);
++ M1 = _mm_add_epi32(M2, M3);
++ M2 = _mm_add_epi32(M4, M5);
++ M3 = _mm_add_epi32(M6, M7);
+
+- M0 = _mm_hadd_epi32(M0, M1);
+- M1 = _mm_hadd_epi32(M2, M3);
++ M0 = _mm_add_epi32(M0, M1);
++ M1 = _mm_add_epi32(M2, M3);
+
+- M2 = _mm_add_epi32(M0, offset);
+- M3 = _mm_add_epi32(M1, offset);
+- M2 = _mm_srai_epi32(M2, shift1);
+- M3 = _mm_srai_epi32(M3, shift1);
+- M2 = _mm_packs_epi32(M2, M3);
+- _mm_storeu_si128((__m128i*)tmp, M2);
++ M2 = _mm_add_epi32(M0, offset);
++ M3 = _mm_add_epi32(M1, offset);
++ M2 = _mm_srai_epi32(M2, shift1);
++ M3 = _mm_srai_epi32(M3, shift1);
++ M2 = _mm_packs_epi32(M2, M3);
++ _mm_store_si128((__m128i*)tmp, M2);
+
+- tmp += i_tmp;
+- src += i_src;
++ tmp += i_tmp;
++ }
+ }
+
+- offset = _mm_set1_epi32(add2);
+- max_pel = _mm_set1_epi16((pel)max_val);
+- tmp = tmp_res;
+-
+- coeff00 = _mm_set1_epi16(*(s16*)coef_y);
+- coeff01 = _mm_set1_epi16(*(s16*)(coef_y + 2));
+- coeff02 = _mm_set1_epi16(*(s16*)(coef_y + 4));
+- coeff03 = _mm_set1_epi16(*(s16*)(coef_y + 6));
+- coeff00 = _mm_cvtepi8_epi16(coeff00);
+- coeff01 = _mm_cvtepi8_epi16(coeff01);
+- coeff02 = _mm_cvtepi8_epi16(coeff02);
+- coeff03 = _mm_cvtepi8_epi16(coeff03);
+-
+- while (height--) {
+- T0 = _mm_load_si128((__m128i*)(tmp));
+- T1 = _mm_load_si128((__m128i*)(tmp + i_tmp));
+- T2 = _mm_load_si128((__m128i*)(tmp + 2 * i_tmp));
+- T3 = _mm_load_si128((__m128i*)(tmp + 3 * i_tmp));
+- T4 = _mm_load_si128((__m128i*)(tmp + 4 * i_tmp));
+- T5 = _mm_load_si128((__m128i*)(tmp + 5 * i_tmp));
+- T6 = _mm_load_si128((__m128i*)(tmp + 6 * i_tmp));
+- T7 = _mm_load_si128((__m128i*)(tmp + 7 * i_tmp));
++ {
++ __m128i N0, N1, N2, N3, N4, N5, N6, N7;
++ __m128i T6, T7;
++
++ offset = _mm_set1_epi32(add2);
++ max_pel = _mm_set1_epi16((pel)max_val);
++ tmp = tmp_res;
++
++ mCoef0 = _mm_set1_epi16(((s16*)coef_y)[0]);
++ mCoef1 = _mm_set1_epi16(((s16*)coef_y)[1]);
++ mCoef2 = _mm_set1_epi16(((s16*)coef_y)[2]);
++ mCoef3 = _mm_set1_epi16(((s16*)coef_y)[3]);
++ mCoef0 = _mm_cvtepi8_epi16(mCoef0);
++ mCoef1 = _mm_cvtepi8_epi16(mCoef1);
++ mCoef2 = _mm_cvtepi8_epi16(mCoef2);
++ mCoef3 = _mm_cvtepi8_epi16(mCoef3);
++
++ while (height--) {
++ T0 = _mm_load_si128((__m128i*)(tmp));
++ T1 = _mm_load_si128((__m128i*)(tmp + i_tmp));
++ T2 = _mm_load_si128((__m128i*)(tmp + 2 * i_tmp));
++ T3 = _mm_load_si128((__m128i*)(tmp + 3 * i_tmp));
++ T4 = _mm_load_si128((__m128i*)(tmp + 4 * i_tmp));
++ T5 = _mm_load_si128((__m128i*)(tmp + 5 * i_tmp));
++ T6 = _mm_load_si128((__m128i*)(tmp + 6 * i_tmp));
++ T7 = _mm_load_si128((__m128i*)(tmp + 7 * i_tmp));
+
+- M0 = _mm_unpacklo_epi16(T0, T1);
+- M1 = _mm_unpacklo_epi16(T2, T3);
+- M2 = _mm_unpacklo_epi16(T4, T5);
+- M3 = _mm_unpacklo_epi16(T6, T7);
+- M4 = _mm_unpackhi_epi16(T0, T1);
+- M5 = _mm_unpackhi_epi16(T2, T3);
+- M6 = _mm_unpackhi_epi16(T4, T5);
+- M7 = _mm_unpackhi_epi16(T6, T7);
++ M0 = _mm_unpacklo_epi16(T0, T1);
++ M1 = _mm_unpacklo_epi16(T2, T3);
++ M2 = _mm_unpacklo_epi16(T4, T5);
++ M3 = _mm_unpacklo_epi16(T6, T7);
++ M4 = _mm_unpackhi_epi16(T0, T1);
++ M5 = _mm_unpackhi_epi16(T2, T3);
++ M6 = _mm_unpackhi_epi16(T4, T5);
++ M7 = _mm_unpackhi_epi16(T6, T7);
+
+- N0 = _mm_madd_epi16(M0, coeff00);
+- N1 = _mm_madd_epi16(M1, coeff01);
+- N2 = _mm_madd_epi16(M2, coeff02);
+- N3 = _mm_madd_epi16(M3, coeff03);
+- N4 = _mm_madd_epi16(M4, coeff00);
+- N5 = _mm_madd_epi16(M5, coeff01);
+- N6 = _mm_madd_epi16(M6, coeff02);
+- N7 = _mm_madd_epi16(M7, coeff03);
++ N0 = _mm_madd_epi16(M0, mCoef0);
++ N1 = _mm_madd_epi16(M1, mCoef1);
++ N2 = _mm_madd_epi16(M2, mCoef2);
++ N3 = _mm_madd_epi16(M3, mCoef3);
++ N4 = _mm_madd_epi16(M4, mCoef0);
++ N5 = _mm_madd_epi16(M5, mCoef1);
++ N6 = _mm_madd_epi16(M6, mCoef2);
++ N7 = _mm_madd_epi16(M7, mCoef3);
+
+- N0 = _mm_add_epi32(N0, N1);
+- N1 = _mm_add_epi32(N2, N3);
+- N2 = _mm_add_epi32(N4, N5);
+- N3 = _mm_add_epi32(N6, N7);
++ N0 = _mm_add_epi32(N0, N1);
++ N1 = _mm_add_epi32(N2, N3);
++ N2 = _mm_add_epi32(N4, N5);
++ N3 = _mm_add_epi32(N6, N7);
+
+- N0 = _mm_add_epi32(N0, N1);
+- N1 = _mm_add_epi32(N2, N3);
++ N0 = _mm_add_epi32(N0, N1);
++ N1 = _mm_add_epi32(N2, N3);
+
+- N0 = _mm_add_epi32(N0, offset);
+- N1 = _mm_add_epi32(N1, offset);
+- N0 = _mm_srai_epi32(N0, shift2);
+- N1 = _mm_srai_epi32(N1, shift2);
+- N0 = _mm_packus_epi32(N0, N1);
+- N0 = _mm_min_epu16(N0, max_pel);
+- _mm_storeu_si128((__m128i*)(dst), N0);
++ N0 = _mm_add_epi32(N0, offset);
++ N1 = _mm_add_epi32(N1, offset);
++ N0 = _mm_srai_epi32(N0, shift2);
++ N1 = _mm_srai_epi32(N1, shift2);
++ N0 = _mm_packus_epi32(N0, N1);
++ N0 = _mm_min_epu16(N0, max_pel);
++ _mm_storeu_si128((__m128i*)(dst), N0);
+
+- dst += i_dst;
+- tmp += i_tmp;
++ dst += i_dst;
++ tmp += i_tmp;
++ }
+ }
+ }
+
+diff --git a/source/decore/sse/intra_pred_sse.c b/source/decore/sse/intra_pred_sse.c
+index b877834..d77b556 100644
+--- a/source/decore/sse/intra_pred_sse.c
++++ b/source/decore/sse/intra_pred_sse.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -713,13 +708,14 @@ void uavs3d_ipred_chroma_hor_sse(pel *src, pel *dst, int i_dst, int width, int h
+
+ void uavs3d_ipred_dc_sse(pel *src, pel *dst, int i_dst, int width, int height, u16 avail_cu, int bit_depth)
+ {
+- int i, x, y;
++ int x, y;
+ int dc;
+ pel *p_src = src - 1;
+ int left_avail = IS_AVAIL(avail_cu, AVAIL_LE);
+ int above_avail = IS_AVAIL(avail_cu, AVAIL_UP);
+
+ if (left_avail && above_avail) {
++ int i;
+ int length = width + height + 1;
+ __m128i sum = _mm_setzero_si128();
+ __m128i val;
+@@ -828,7 +824,7 @@ void uavs3d_ipred_dc_sse(pel *src, pel *dst, int i_dst, int width, int height, u
+ void uavs3d_ipred_chroma_dc_sse(pel *src, pel *dst, int i_dst, int width, int height, u16 avail_cu, int bit_depth)
+ {
+ __m128i T;
+- int i, x, y;
++ int x, y;
+ int dcU, dcV;
+ pel *p_src = src - 2;
+ int left_avail = IS_AVAIL(avail_cu, AVAIL_LE);
+@@ -838,6 +834,7 @@ void uavs3d_ipred_chroma_dc_sse(pel *src, pel *dst, int i_dst, int width, int he
+ int height2 = height << 1;
+ int wh = width + height;
+ int length = (wh << 1) + 2; // 2*(width + height + 1)
++ int i;
+ __m128i sum = _mm_setzero_si128();
+ __m128i val;
+
+@@ -1787,7 +1784,6 @@ void uavs3d_ipred_ipf_s16_sse(pel *src, pel *dst, int i_dst, s16* pred, int flt_
+ {
+ pel *p_top = src + 1;
+ int row;
+- int max_val = (1 << bit_depth) - 1;
+ __m128i c_32 = _mm_set1_epi16(32);
+ __m128i zero = _mm_setzero_si128();
+ if (w == 4) {
+diff --git a/source/decore/sse/itrans_sse.c b/source/decore/sse/itrans_sse.c
+index f7a5051..217e88e 100644
+--- a/source/decore/sse/itrans_sse.c
++++ b/source/decore/sse/itrans_sse.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/sse/pixel_sse.c b/source/decore/sse/pixel_sse.c
+index 46ce33f..804b71d 100644
+--- a/source/decore/sse/pixel_sse.c
++++ b/source/decore/sse/pixel_sse.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/sse/sao_sse.c b/source/decore/sse/sao_sse.c
+index 3459b3e..5f4723b 100644
+--- a/source/decore/sse/sao_sse.c
++++ b/source/decore/sse/sao_sse.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/sse/sse.c b/source/decore/sse/sse.c
+index cb8c119..570edf5 100644
+--- a/source/decore/sse/sse.c
++++ b/source/decore/sse/sse.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/source/decore/sse/sse.h b/source/decore/sse/sse.h
+index 967808a..4e10ab7 100644
+--- a/source/decore/sse/sse.h
++++ b/source/decore/sse/sse.h
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -45,12 +40,18 @@
+
+ #include "modules.h"
+
+-#ifdef _WIN32
+-
+-#ifndef _WIN64
+-#define _mm_extract_epi64(a, i) (a.m128i_i64[i])
+-#endif
+-
++#if __x86_64__
++#elif __i386__ && !defined(_mm_extract_epi64)
++#define _mm_extract_epi64 _mm_extract_epi64
++#include <stdint.h>
++static inline int64_t _mm_extract_epi64(__m128i a, const int imm8) {
++    /* i386 fallback: rebuild 64-bit lane imm8 (0 or 1) from its four 16-bit lanes. Each lane is widened */
++    /* as unsigned before shifting; an int lane << 16 with bit 15 set would sign-extend into bits 32..63. */
++    return imm8 ? (int64_t)(((uint64_t)(uint16_t)_mm_extract_epi16(a, 7) << 48) | ((uint64_t)(uint16_t)_mm_extract_epi16(a, 6) << 32) |
++                            ((uint64_t)(uint16_t)_mm_extract_epi16(a, 5) << 16) | (uint64_t)(uint16_t)_mm_extract_epi16(a, 4))
++                : (int64_t)(((uint64_t)(uint16_t)_mm_extract_epi16(a, 3) << 48) | ((uint64_t)(uint16_t)_mm_extract_epi16(a, 2) << 32) |
++                            ((uint64_t)(uint16_t)_mm_extract_epi16(a, 1) << 16) | (uint64_t)(uint16_t)_mm_extract_epi16(a, 0));
++}
+ #endif
+
+ ALIGNED_32(extern pel uavs3d_simd_mask[15][16]);
+diff --git a/source/decore/threadpool.h b/source/decore/threadpool.h
+index 3370beb..6a74bac 100644
+--- a/source/decore/threadpool.h
++++ b/source/decore/threadpool.h
+@@ -11,9 +11,6 @@ typedef volatile long atom_t; // 32 bits, signed
+ #if defined(_WIN32)
+ #include "win32thread.h"
+ #else
+-
+-#pragma comment(lib, "pthreadVC2.lib")
+-
+ #include <pthread.h>
+ #define uavs3d_pthread_t pthread_t
+ #define uavs3d_pthread_create pthread_create
+diff --git a/test/utest.c b/test/utest.c
+index 724c7d8..e4df366 100644
+--- a/test/utest.c
++++ b/test/utest.c
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+@@ -202,8 +197,8 @@ static int app_img_write(char * fname, uavs3d_io_frm_t * img, com_seqh_t *seqhdr
+ return -1;
+ }
+ for (i = 0; i < 3; i++) {
+- int hor_size = seqhdr->horizontal_size >> (i ? 1 : 0);
+- int ver_size = seqhdr->vertical_size >> (i ? 1 : 0);
++ int hor_size = seqhdr->display_horizontal_size >> (i ? 1 : 0);
++ int ver_size = seqhdr->display_vertical_size >> (i ? 1 : 0);
+ unsigned char * p8 = (unsigned char *)img->buffer[i];
+
+ for (j = 0; j < ver_size; j++) {
+@@ -412,10 +407,13 @@ void output_callback(uavs3d_io_frm_t *frm) {
+ }
+
+ #if defined(__APPLE__) && (defined(__arm64__) || defined(__ARM_NEON__))
+-int uavs3d_decode_sample(int argc, const char **argv)
+-#else
+-int main(int argc, const char **argv)
++#include <TargetConditionals.h>
++#if !TARGET_OS_OSX
++#define main uavs3d_decode_sample
++#endif
+ #endif
++
++int main(int argc, const char **argv)
+ {
+ int decoding = 1;
+ unsigned char * bs_buf = NULL;
+@@ -465,7 +463,7 @@ int main(int argc, const char **argv)
+ dec_cfg.log_level = 1;
+ dec_cfg.frm_threads = 1;
+
+- if (argc < 2) {
++ if ((argc < 2) || !(argc % 2)) {
+ log_level_0("Error config, please check arguments: \n");
+ print_help();
+ return -1;
+@@ -588,8 +586,8 @@ finished:
+ if (frame_num) {
+ width = dec_frame.width[0];
+ height = dec_frame.height[0];
+- hor_size = dec_frame.seqhdr->horizontal_size;
+- ver_size = dec_frame.seqhdr->vertical_size;
++ hor_size = dec_frame.seqhdr->display_horizontal_size;
++ ver_size = dec_frame.seqhdr->display_vertical_size;
+
+ log_level_1("=========================================================================================\n");
+ log_level_1(" Resolution = %d x %d (Coding: %d x %d)\n", hor_size, ver_size, width, height);
+diff --git a/test/utest.h b/test/utest.h
+index aceb8a4..1dbd239 100644
+--- a/test/utest.h
++++ b/test/utest.h
+@@ -1,5 +1,5 @@
+ /**************************************************************************************
+- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
++ * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School",
+ * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
+ *
+ * All rights reserved.
+@@ -11,12 +11,7 @@
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+- * 3. All advertising materials mentioning features or use of this software
+- * must display the following acknowledgement:
+- * This product includes the software uAVS3d developed by
+- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
+- * and Guangdong Bohua UHD Innovation Corporation.
+- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
++ * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School,
+ * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+diff --git a/version.sh b/version.sh
+index 65e2df7..034d454 100755
+--- a/version.sh
++++ b/version.sh
+@@ -15,12 +15,12 @@ else
+ shell_dir=$1
+ fi
+
+-VER_R=`git rev-list origin/master | sort | wc -l | gawk '{print $1}'`
+-VER_L=`git rev-list HEAD | sort | wc -l | gawk '{print $1}'`
++VER_R=`git rev-list origin/master | sort | wc -l | awk '{print $1}'`
++VER_L=`git rev-list HEAD | sort | wc -l | awk '{print $1}'`
+ VER_SHA1=`git log -n 1 | head -n 1 | cut -d ' ' -f 2`
+
+ major_version="1"
+-minor_version="1"
++minor_version="2"
+ type_version="release"
+
+ # generate the file version.h
diff --git a/multimedia/uavs3d/cmakelist.patch b/multimedia/uavs3d/fix-libdir-in-cmakelists.patch
index d3420525b0..375a1e87e2 100644
--- a/multimedia/uavs3d/cmakelist.patch
+++ b/multimedia/uavs3d/fix-libdir-in-cmakelists.patch
@@ -1,31 +1,22 @@
---- CMakeLists.txt 2022-09-11 07:28:37.000000000 +0530
-+++ CMakeListsn.txt 2023-02-05 13:56:32.245000000 +0530
-@@ -1,4 +1,3 @@
--
- set(LIBNAME uavs3d)
-
- # check cpu
-@@ -13,6 +12,8 @@
+--- /home/ben/sbo/ff/uavs3d/source/CMakeLists.txt 2024-02-21 15:38:58.265755999 +0530
++++ /home/ben/sbo/ff/uavs3d/source/CMakeListsn.txt 2024-02-21 16:03:45.133751729 +0530
+@@ -13,6 +13,8 @@
" Unexpected pointer size ${CMAKE_SIZEOF_VOID_P} for ${CMAKE_SYSTEM_PROCESSOR}\n")
endif()
elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "i386" OR
+ "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "i586" OR
-+ "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "i686" OR
++ "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "i686" OR
"${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86")
set(UAVS3D_TARGET_CPU "x86")
- elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "aarch64")
-@@ -104,7 +105,17 @@
+ elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "aarch64" OR
+@@ -109,7 +111,13 @@
find_package(Threads REQUIRED)
set(prefix "${CMAKE_INSTALL_PREFIX}")
set(includedir "include")
+if(${CMAKE_SIZEOF_VOID_P} EQUAL 4)
-+set(libdir "lib")
-+elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^arm")
set(libdir "lib")
+elseif(${CMAKE_SIZEOF_VOID_P} EQUAL 8)
+set(libdir "lib64")
-+elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "aarch64")
-+set(libdir "lib64")
+else()
+message(FATAL_ERROR " Compiling for wrong architecture in Slackware GNU/Linux \n")
+endif()
diff --git a/multimedia/uavs3d/uavs3d.SlackBuild b/multimedia/uavs3d/uavs3d.SlackBuild
index 91029aad22..141e072f29 100644
--- a/multimedia/uavs3d/uavs3d.SlackBuild
+++ b/multimedia/uavs3d/uavs3d.SlackBuild
@@ -2,7 +2,7 @@
# Slackware build script for uavs3d
-# Copyright 2022-23 Vijay Marcel
+# Copyright 2022-24 Vijay Marcel
# All rights reserved.
#
# Redistribution and use of this script, with or without modification, is
@@ -25,19 +25,18 @@
cd $(dirname $0) ; CWD=$(pwd)
PRGNAM=uavs3d
-VERSION=${VERSION:-20220911_0133ee4}
-COMMIT=${COMMIT:-0133ee4b4bbbef7b88802e7ad019b14b9b852c2b}
+VERSION=${VERSION:-1.1}
BUILD=${BUILD:-1}
TAG=${TAG:-_SBo}
PKGTYPE=${PKGTYPE:-tgz}
+sys_arch=${sys_arch:-$(uname -m)}
-if [ -z "$ARCH" ]; then
- case "$( uname -m )" in
- i?86) export ARCH=i586 ;;
- arm*) export ARCH=arm ;;
- *) export ARCH=$( uname -m ) ;;
- esac
-fi
+case "$sys_arch" in
+ i586) export ARCH=i586 ;;
+ i686) export ARCH=i686 ;;
+ x86_64) export ARCH=x86_64 ;;
+ *) echo "This program will not build on $sys_arch platform" && exit 1 ;;
+esac
TMP=${TMP:-/tmp/SBo}
PKG=$TMP/package-$PRGNAM
@@ -53,13 +52,9 @@ elif [ "$ARCH" = "x86_64" ]; then
SLKCFLAGS="-march=x86-64 -mtune=generic -O2 -pipe -fPIC"
LIBDIRSUFFIX="64"
else
- SLKCFLAGS="-O2"
- LIBDIRSUFFIX=""
+ echo "This program will not build on $sys_arch platform" && exit 1
fi
-# If the variable PRINT_PACKAGE_NAME is set, then this script will report what
-# the name of the created package would be, and then exit. This information
-# could be useful to other scripts.
if [ ! -z "${PRINT_PACKAGE_NAME}" ]; then
echo "$PRGNAM-$VERSION-$ARCH-$BUILD$TAG.$PKGTYPE"
exit 0
@@ -71,9 +66,9 @@ trap 'echo "$0 FAILED at line $LINENO!" | tee -a $OUTPUT/error-${PRGNAM}.log' ER
rm -rf $PKG
mkdir -p $TMP $PKG $OUTPUT
cd $TMP
-rm -rf $PRGNAM-$COMMIT
-tar xvf $CWD/$PRGNAM-$COMMIT.tar.gz
-cd $PRGNAM-$COMMIT
+rm -rf $PRGNAM-$VERSION
+tar xvf $CWD/$PRGNAM-$VERSION.tar.gz
+cd $PRGNAM-$VERSION
chown -R root:root .
find -L . \
@@ -82,13 +77,14 @@ find -L . \
\( -perm 666 -o -perm 664 -o -perm 640 -o -perm 600 -o -perm 444 \
-o -perm 440 -o -perm 400 \) -exec chmod 644 {} \;
+patch -p1 --verbose --unified < $CWD/fix-build-issue.patch
cd source
-patch --verbose --unified < $CWD/cmakelist.patch
+patch --verbose --unified < $CWD/fix-libdir-in-cmakelists.patch
cd ..
mkdir -pv build/linux
-cmake -B build/linux -S $TMP/$PRGNAM-$COMMIT \
+cmake -B build/linux -S $TMP/$PRGNAM-$VERSION \
-DCMAKE_C_FLAGS:STRING="$SLKCFLAGS" \
-DCMAKE_CXX_FLAGS:STRING="$SLKCFLAGS" \
-DCMAKE_BUILD_TYPE:STRING=Release \
@@ -99,12 +95,12 @@ cmake -B build/linux -S $TMP/$PRGNAM-$COMMIT \
cmake --build build/linux
-make -C $TMP/$PRGNAM-$COMMIT/build/linux DESTDIR="$PKG" install
+make -C $TMP/$PRGNAM-$VERSION/build/linux DESTDIR="$PKG" install
-install -D -m755 $TMP/$PRGNAM-$COMMIT/build/linux/uavs3dec -t "$PKG/usr/bin"
+install -D -m755 $TMP/$PRGNAM-$VERSION/build/linux/uavs3dec -t "$PKG/usr/bin"
find $PKG -print0 | xargs -0 file | grep -e "executable" -e "shared object" \
- | grep ELF | cut -f 1 -d : | xargs strip --strip-unneeded 2> /dev/null || true
+ | grep ELF | cut -f 1 -d : | xargs strip --strip-unneeded --remove-section=.comment --remove-section=.note 2> /dev/null || true
# Don't ship .la files:
rm -f $PKG/{,usr/}lib${LIBDIRSUFFIX}/*.la
diff --git a/multimedia/uavs3d/uavs3d.info b/multimedia/uavs3d/uavs3d.info
index 5582757b68..ee0adc5d16 100644
--- a/multimedia/uavs3d/uavs3d.info
+++ b/multimedia/uavs3d/uavs3d.info
@@ -1,8 +1,8 @@
PRGNAM="uavs3d"
-VERSION="20220911_0133ee4"
+VERSION="1.1"
HOMEPAGE="https://github.com/uavs3/uavs3d"
-DOWNLOAD="https://github.com/uavs3/uavs3d/archive/0133ee4/uavs3d-0133ee4b4bbbef7b88802e7ad019b14b9b852c2b.tar.gz"
-MD5SUM="bf9b1d5d85e6f89e0953572e43a26c33"
+DOWNLOAD="https://github.com/uavs3/uavs3d/archive/v1.1/uavs3d-1.1.tar.gz"
+MD5SUM="a22d9d4f1da4d1e2d0b19a25754505c3"
DOWNLOAD_x86_64=""
MD5SUM_x86_64=""
REQUIRES=""