// Tencent is pleased to support the open source community by making ncnn available. // // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. You may obtain a copy of the License at // // https://opensource.org/licenses/BSD-3-Clause // // Unless required by applicable law or agreed to in writing, software distributed // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. #include "mat.h" #if __ARM_NEON #include #endif // __ARM_NEON #include "platform.h" namespace ncnn { #if NCNN_PIXEL_ROTATE // should be a kanna ascii art here in my local branch // but we shall ask the original art author for permission first ... // https://www.reddit.com/r/anime/comments/5uxjn4/i_recreated_the_kanna_ascii_art_from_kobayashisan/ static void kanna_rotate_1_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) { const int srcwgap = srcstride - srcw; const int wgap = stride - w; const unsigned char* src0 = src; const unsigned char* src1 = src + srcstride; unsigned char* dst0 = dst; unsigned char* dst1 = dst + stride; int y = 0; for (; y + 1 < srch; y += 2) { #if __ARM_NEON int nn = srcw >> 5; int remain = srcw - (nn << 5); #if __aarch64__ for (; nn > 0; nn--) { uint8x16_t _src0 = vld1q_u8(src0); uint8x16_t _src0n = vld1q_u8(src0 + 16); vst1q_u8(dst0, _src0); vst1q_u8(dst0 + 16, _src0n); uint8x16_t _src1 = vld1q_u8(src1); uint8x16_t _src1n = vld1q_u8(src1 + 16); vst1q_u8(dst1, _src1); vst1q_u8(dst1 + 16, _src1n); src0 += 32; src1 += 32; dst0 += 32; dst1 += 32; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #256] \n" "vld1.u8 {d0-d3}, [%1]! \n" "pld [%2, #256] \n" "vld1.u8 {d4-d7}, [%2]! \n" "subs %0, #1 \n" "vst1.u8 {d0-d3}, [%3]! \n" "vst1.u8 {d4-d7}, [%4]! \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(src1), // %2 "=r"(dst0), // %3 "=r"(dst1) // %4 : "0"(nn), "1"(src0), "2"(src1), "3"(dst0), "4"(dst1) : "cc", "memory", "q0", "q1", "q2", "q3"); } #endif // __aarch64__ #else int remain = srcw; #endif // __ARM_NEON for (; remain > 0; remain--) { *dst0++ = *src0++; *dst1++ = *src1++; } src0 += srcwgap + srcstride; src1 += srcwgap + srcstride; dst0 += wgap + stride; dst1 += wgap + stride; } for (; y < srch; y++) { #if __ARM_NEON int nn = srcw >> 5; int remain = srcw - (nn << 5); #if __aarch64__ for (; nn > 0; nn--) { uint8x16_t _src = vld1q_u8(src0); uint8x16_t _src2 = vld1q_u8(src0 + 16); vst1q_u8(dst0, _src); vst1q_u8(dst0 + 16, _src2); src0 += 32; dst0 += 32; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #256] \n" "vld1.u8 {d0-d3}, [%1]! \n" "subs %0, #1 \n" "vst1.u8 {d0-d3}, [%2]! \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(dst0) // %2 : "0"(nn), "1"(src0), "2"(dst0) : "cc", "memory", "q0", "q1"); } #endif // __aarch64__ #else int remain = srcw; #endif // __ARM_NEON for (; remain > 0; remain--) { *dst0++ = *src0++; } src0 += srcwgap; dst0 += wgap; } } static void kanna_rotate_1_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) { const int srcwgap = srcstride - srcw * 2; const int wgap = stride - w * 2; int size = srcw * 2; const unsigned char* src0 = src; const unsigned char* src1 = src + srcstride; unsigned char* dst0 = dst; unsigned char* dst1 = dst + stride; int y = 0; for (; y + 1 < srch; y += 2) { #if __ARM_NEON int nn = size >> 5; int remain = size - (nn << 5); #if __aarch64__ for (; nn > 0; nn--) { uint8x16_t _src0 = vld1q_u8(src0); uint8x16_t _src0n = vld1q_u8(src0 + 16); vst1q_u8(dst0, _src0); vst1q_u8(dst0 + 16, _src0n); uint8x16_t _src1 = vld1q_u8(src1); uint8x16_t _src1n = vld1q_u8(src1 + 16); vst1q_u8(dst1, _src1); vst1q_u8(dst1 + 16, _src1n); src0 += 32; src1 += 32; dst0 += 32; dst1 += 32; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #256] \n" "vld1.u8 {d0-d3}, [%1]! \n" "pld [%2, #256] \n" "vld1.u8 {d4-d7}, [%2]! \n" "subs %0, #1 \n" "vst1.u8 {d0-d3}, [%3]! \n" "vst1.u8 {d4-d7}, [%4]! \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(src1), // %2 "=r"(dst0), // %3 "=r"(dst1) // %4 : "0"(nn), "1"(src0), "2"(src1), "3"(dst0), "4"(dst1) : "cc", "memory", "q0", "q1", "q2", "q3"); } #endif // __aarch64__ #else int remain = size; #endif // __ARM_NEON for (; remain > 0; remain--) { *dst0++ = *src0++; *dst1++ = *src1++; } src0 += srcwgap + srcstride; src1 += srcwgap + srcstride; dst0 += wgap + stride; dst1 += wgap + stride; } for (; y < srch; y++) { #if __ARM_NEON int nn = size >> 5; int remain = size - (nn << 5); #if __aarch64__ for (; nn > 0; nn--) { uint8x16_t _src = vld1q_u8(src0); uint8x16_t _src2 = vld1q_u8(src0 + 16); vst1q_u8(dst0, _src); vst1q_u8(dst0 + 16, _src2); src0 += 32; dst0 += 32; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #256] \n" "vld1.u8 {d0-d3}, [%1]! \n" "subs %0, #1 \n" "vst1.u8 {d0-d3}, [%2]! \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(dst0) // %2 : "0"(nn), "1"(src0), "2"(dst0) : "cc", "memory", "q0", "q1"); } #endif // __aarch64__ #else int remain = size; #endif // __ARM_NEON for (; remain > 0; remain--) { *dst0++ = *src0++; } src0 += srcwgap; dst0 += wgap; } } static void kanna_rotate_1_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) { const int srcwgap = srcstride - srcw * 3; const int wgap = stride - w * 3; int size = srcw * 3; const unsigned char* src0 = src; const unsigned char* src1 = src + srcstride; unsigned char* dst0 = dst; unsigned char* dst1 = dst + stride; int y = 0; for (; y + 1 < srch; y += 2) { #if __ARM_NEON int nn = size >> 5; int remain = size - (nn << 5); #if __aarch64__ for (; nn > 0; nn--) { uint8x16_t _src0 = vld1q_u8(src0); uint8x16_t _src0n = vld1q_u8(src0 + 16); vst1q_u8(dst0, _src0); vst1q_u8(dst0 + 16, _src0n); uint8x16_t _src1 = vld1q_u8(src1); uint8x16_t _src1n = vld1q_u8(src1 + 16); vst1q_u8(dst1, _src1); vst1q_u8(dst1 + 16, _src1n); src0 += 32; src1 += 32; dst0 += 32; dst1 += 32; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #256] \n" "vld1.u8 {d0-d3}, [%1]! \n" "pld [%2, #256] \n" "vld1.u8 {d4-d7}, [%2]! \n" "subs %0, #1 \n" "vst1.u8 {d0-d3}, [%3]! \n" "vst1.u8 {d4-d7}, [%4]! \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(src1), // %2 "=r"(dst0), // %3 "=r"(dst1) // %4 : "0"(nn), "1"(src0), "2"(src1), "3"(dst0), "4"(dst1) : "cc", "memory", "q0", "q1", "q2", "q3"); } #endif // __aarch64__ #else int remain = size; #endif // __ARM_NEON for (; remain > 0; remain--) { *dst0++ = *src0++; *dst1++ = *src1++; } src0 += srcwgap + srcstride; src1 += srcwgap + srcstride; dst0 += wgap + stride; dst1 += wgap + stride; } for (; y < srch; y++) { #if __ARM_NEON int nn = size >> 5; int remain = size - (nn << 5); #if __aarch64__ for (; nn > 0; nn--) { uint8x16_t _src = vld1q_u8(src0); uint8x16_t _src2 = vld1q_u8(src0 + 16); vst1q_u8(dst0, _src); vst1q_u8(dst0 + 16, _src2); src0 += 32; dst0 += 32; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #256] \n" "vld1.u8 {d0-d3}, [%1]! \n" "subs %0, #1 \n" "vst1.u8 {d0-d3}, [%2]! \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(dst0) // %2 : "0"(nn), "1"(src0), "2"(dst0) : "cc", "memory", "q0", "q1"); } #endif // __aarch64__ #else int remain = size; #endif // __ARM_NEON for (; remain > 0; remain--) { *dst0++ = *src0++; } src0 += srcwgap; dst0 += wgap; } } static void kanna_rotate_1_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) { const int srcwgap = srcstride - srcw * 4; const int wgap = stride - w * 4; int size = srcw * 4; const unsigned char* src0 = src; const unsigned char* src1 = src + srcstride; unsigned char* dst0 = dst; unsigned char* dst1 = dst + stride; int y = 0; for (; y + 1 < srch; y += 2) { #if __ARM_NEON int nn = size >> 5; int remain = size - (nn << 5); #if __aarch64__ for (; nn > 0; nn--) { uint8x16_t _src0 = vld1q_u8(src0); uint8x16_t _src0n = vld1q_u8(src0 + 16); vst1q_u8(dst0, _src0); vst1q_u8(dst0 + 16, _src0n); uint8x16_t _src1 = vld1q_u8(src1); uint8x16_t _src1n = vld1q_u8(src1 + 16); vst1q_u8(dst1, _src1); vst1q_u8(dst1 + 16, _src1n); src0 += 32; src1 += 32; dst0 += 32; dst1 += 32; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #256] \n" "vld1.u8 {d0-d3}, [%1]! \n" "pld [%2, #256] \n" "vld1.u8 {d4-d7}, [%2]! \n" "subs %0, #1 \n" "vst1.u8 {d0-d3}, [%3]! \n" "vst1.u8 {d4-d7}, [%4]! \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(src1), // %2 "=r"(dst0), // %3 "=r"(dst1) // %4 : "0"(nn), "1"(src0), "2"(src1), "3"(dst0), "4"(dst1) : "cc", "memory", "q0", "q1", "q2", "q3"); } #endif // __aarch64__ #else int remain = size; #endif // __ARM_NEON for (; remain > 0; remain--) { *dst0++ = *src0++; *dst1++ = *src1++; } src0 += srcwgap + srcstride; src1 += srcwgap + srcstride; dst0 += wgap + stride; dst1 += wgap + stride; } for (; y < srch; y++) { #if __ARM_NEON int nn = size >> 5; int remain = size - (nn << 5); #if __aarch64__ for (; nn > 0; nn--) { uint8x16_t _src = vld1q_u8(src0); uint8x16_t _src2 = vld1q_u8(src0 + 16); vst1q_u8(dst0, _src); vst1q_u8(dst0 + 16, _src2); src0 += 32; dst0 += 32; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #256] \n" "vld1.u8 {d0-d3}, [%1]! \n" "subs %0, #1 \n" "vst1.u8 {d0-d3}, [%2]! \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(dst0) // %2 : "0"(nn), "1"(src0), "2"(dst0) : "cc", "memory", "q0", "q1"); } #endif // __aarch64__ #else int remain = size; #endif // __ARM_NEON for (; remain > 0; remain--) { *dst0++ = *src0++; } src0 += srcwgap; dst0 += wgap; } } static void kanna_rotate_2_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) { const int srcwgap = srcstride - srcw; const int wgap = stride + w; const unsigned char* src0 = src; unsigned char* dst0 = dst + w - 1; int y = 0; for (; y < srch; y++) { #if __ARM_NEON dst0 -= 15; int nn = srcw >> 4; int remain = srcw - (nn << 4); #if __aarch64__ for (; nn > 0; nn--) { uint8x8_t _src = vld1_u8(src0); uint8x8_t _src2 = vld1_u8(src0 + 8); _src = vrev64_u8(_src); _src2 = vrev64_u8(_src2); vst1_u8(dst0, _src2); vst1_u8(dst0 + 8, _src); src0 += 16; dst0 -= 16; } #else if (nn > 0) { asm volatile( "mov r4, #-16 \n" "0: \n" "pld [%1, #128] \n" "vld1.u8 {d0-d1}, [%1]! \n" "vrev64.u8 d3, d0 \n" "vrev64.u8 d2, d1 \n" "subs %0, #1 \n" "vst1.u8 {d2-d3}, [%2], r4 \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(dst0) // %2 : "0"(nn), "1"(src0), "2"(dst0) : "cc", "memory", "q0", "q1", "r4"); } #endif // __aarch64__ dst0 += 15; #else int remain = srcw; #endif // __ARM_NEON for (; remain > 0; remain--) { *dst0 = *src0; src0 += 1; dst0 -= 1; } src0 += srcwgap; dst0 += wgap; } } static void kanna_rotate_2_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) { const int srcwgap = srcstride - srcw * 2; const int wgap = stride + w * 2; const unsigned char* src0 = src; unsigned char* dst0 = dst + w * 2 - 2; int y = 0; for (; y < srch; y++) { #if __ARM_NEON dst0 -= 7 * 2; int nn = srcw >> 4; int remain = srcw - (nn << 4); #if __aarch64__ for (; nn > 0; nn--) { uint8x8x2_t _src = vld2_u8(src0); uint8x8x2_t _src2 = vld2_u8(src0 + 8 * 2); _src.val[0] = vrev64_u8(_src.val[0]); _src.val[1] = vrev64_u8(_src.val[1]); _src2.val[0] = vrev64_u8(_src2.val[0]); _src2.val[1] = vrev64_u8(_src2.val[1]); vst2_u8(dst0, _src); vst2_u8(dst0 - 8 * 2, _src2); src0 += 16 * 2; dst0 -= 16 * 2; } #else if (nn > 0) { asm volatile( "mov r4, #-16 \n" "0: \n" "pld [%1, #128] \n" "vld2.u8 {d0-d1}, [%1]! \n" "vrev64.u8 d0, d0 \n" "pld [%1, #128] \n" "vld2.u8 {d2-d3}, [%1]! \n" "vrev64.u8 d1, d1 \n" "vrev64.u8 d2, d2 \n" "vst2.u8 {d0-d1}, [%2], r4 \n" "vrev64.u8 d3, d3 \n" "subs %0, #1 \n" "vst2.u8 {d2-d3}, [%2], r4 \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(dst0) // %2 : "0"(nn), "1"(src0), "2"(dst0) : "cc", "memory", "q0", "q1", "r4"); } #endif // __aarch64__ dst0 += 7 * 2; #else int remain = srcw; #endif // __ARM_NEON for (; remain > 0; remain--) { dst0[0] = src0[0]; dst0[1] = src0[1]; src0 += 2; dst0 -= 2; } src0 += srcwgap; dst0 += wgap; } } static void kanna_rotate_2_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) { const int srcwgap = srcstride - srcw * 3; const int wgap = stride + w * 3; const unsigned char* src0 = src; unsigned char* dst0 = dst + w * 3 - 3; int y = 0; for (; y < srch; y++) { #if __ARM_NEON dst0 -= 7 * 3; int nn = srcw >> 4; int remain = srcw - (nn << 4); #if __aarch64__ for (; nn > 0; nn--) { uint8x8x3_t _src = vld3_u8(src0); uint8x8x3_t _src2 = vld3_u8(src0 + 8 * 3); _src.val[0] = vrev64_u8(_src.val[0]); _src.val[1] = vrev64_u8(_src.val[1]); _src.val[2] = vrev64_u8(_src.val[2]); _src2.val[0] = vrev64_u8(_src2.val[0]); _src2.val[1] = vrev64_u8(_src2.val[1]); _src2.val[2] = vrev64_u8(_src2.val[2]); vst3_u8(dst0, _src); vst3_u8(dst0 - 8 * 3, _src2); src0 += 16 * 3; dst0 -= 16 * 3; } #else if (nn > 0) { asm volatile( "mov r4, #-24 \n" "0: \n" "pld [%1, #192] \n" "vld3.u8 {d0-d2}, [%1]! \n" "vrev64.u8 d0, d0 \n" "vrev64.u8 d1, d1 \n" "pld [%1, #192] \n" "vld3.u8 {d4-d6}, [%1]! \n" "vrev64.u8 d2, d2 \n" "vrev64.u8 d4, d4 \n" "vst3.u8 {d0-d2}, [%2], r4 \n" "vrev64.u8 d5, d5 \n" "vrev64.u8 d6, d6 \n" "subs %0, #1 \n" "vst3.u8 {d4-d6}, [%2], r4 \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(dst0) // %2 : "0"(nn), "1"(src0), "2"(dst0) : "cc", "memory", "q0", "q1", "q2", "q3", "r4"); } #endif // __aarch64__ dst0 += 7 * 3; #else int remain = srcw; #endif // __ARM_NEON for (; remain > 0; remain--) { dst0[0] = src0[0]; dst0[1] = src0[1]; dst0[2] = src0[2]; src0 += 3; dst0 -= 3; } src0 += srcwgap; dst0 += wgap; } } static void kanna_rotate_2_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) { const int srcwgap = srcstride - srcw * 4; const int wgap = stride + w * 4; const unsigned char* src0 = src; unsigned char* dst0 = dst + w * 4 - 4; int y = 0; for (; y < srch; y++) { #if __ARM_NEON dst0 -= 7 * 4; int nn = srcw >> 4; int remain = srcw - (nn << 4); #if __aarch64__ for (; nn > 0; nn--) { uint8x8x4_t _src = vld4_u8(src0); uint8x8x4_t _src2 = vld4_u8(src0 + 8 * 4); _src.val[0] = vrev64_u8(_src.val[0]); _src.val[1] = vrev64_u8(_src.val[1]); _src.val[2] = vrev64_u8(_src.val[2]); _src.val[3] = vrev64_u8(_src.val[3]); _src2.val[0] = vrev64_u8(_src2.val[0]); _src2.val[1] = vrev64_u8(_src2.val[1]); _src2.val[2] = vrev64_u8(_src2.val[2]); _src2.val[3] = vrev64_u8(_src2.val[3]); vst4_u8(dst0, _src); vst4_u8(dst0 - 8 * 4, _src2); src0 += 16 * 4; dst0 -= 16 * 4; } #else if (nn > 0) { asm volatile( "mov r4, #-32 \n" "0: \n" "pld [%1, #256] \n" "vld4.u8 {d0-d3}, [%1]! \n" "vrev64.u8 d0, d0 \n" "vrev64.u8 d1, d1 \n" "vrev64.u8 d2, d2 \n" "pld [%1, #256] \n" "vld4.u8 {d4-d7}, [%1]! \n" "vrev64.u8 d3, d3 \n" "vrev64.u8 d4, d4 \n" "vrev64.u8 d5, d5 \n" "vst4.u8 {d0-d3}, [%2], r4 \n" "vrev64.u8 d6, d6 \n" "vrev64.u8 d7, d7 \n" "subs %0, #1 \n" "vst4.u8 {d4-d7}, [%2], r4 \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(dst0) // %2 : "0"(nn), "1"(src0), "2"(dst0) : "cc", "memory", "q0", "q1", "q2", "q3", "r4"); } #endif // __aarch64__ dst0 += 7 * 4; #else int remain = srcw; #endif // __ARM_NEON for (; remain > 0; remain--) { dst0[0] = src0[0]; dst0[1] = src0[1]; dst0[2] = src0[2]; dst0[3] = src0[3]; src0 += 4; dst0 -= 4; } src0 += srcwgap; dst0 += wgap; } } static void kanna_rotate_3_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) { const int srcwgap = srcstride - srcw; const int wgap = stride - w; // point to the last dst pixel unsigned char* dstend = dst + stride * h - wgap; const unsigned char* src0 = src; unsigned char* dst0 = dstend - 1; int y = 0; for (; y < srch; y++) { #if __ARM_NEON dst0 -= 15; int nn = srcw >> 4; int remain = srcw - (nn << 4); #if __aarch64__ for (; nn > 0; nn--) { uint8x8_t _src = vld1_u8(src0); uint8x8_t _src2 = vld1_u8(src0 + 8); _src = vrev64_u8(_src); _src2 = vrev64_u8(_src2); vst1_u8(dst0, _src2); vst1_u8(dst0 + 8, _src); src0 += 16; dst0 -= 16; } #else if (nn > 0) { asm volatile( "mov r4, #-16 \n" "0: \n" "pld [%1, #128] \n" "vld1.u8 {d0-d1}, [%1]! \n" "vrev64.u8 d3, d0 \n" "vrev64.u8 d2, d1 \n" "subs %0, #1 \n" "vst1.u8 {d2-d3}, [%2], r4 \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(dst0) // %2 : "0"(nn), "1"(src0), "2"(dst0) : "cc", "memory", "q0", "q1", "r4"); } #endif // __aarch64__ dst0 += 15; #else int remain = srcw; #endif // __ARM_NEON for (; remain > 0; remain--) { *dst0 = *src0; src0 += 1; dst0 -= 1; } src0 += srcwgap; dst0 -= wgap; } } static void kanna_rotate_3_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) { const int srcwgap = srcstride - srcw * 2; const int wgap = stride - w * 2; // point to the last dst pixel unsigned char* dstend = dst + stride * h - wgap; const unsigned char* src0 = src; unsigned char* dst0 = dstend - 2; int y = 0; for (; y < srch; y++) { #if __ARM_NEON dst0 -= 7 * 2; int nn = srcw >> 4; int remain = srcw - (nn << 4); #if __aarch64__ for (; nn > 0; nn--) { uint8x8x2_t _src = vld2_u8(src0); uint8x8x2_t _src2 = vld2_u8(src0 + 8 * 2); _src.val[0] = vrev64_u8(_src.val[0]); _src.val[1] = vrev64_u8(_src.val[1]); _src2.val[0] = vrev64_u8(_src2.val[0]); _src2.val[1] = vrev64_u8(_src2.val[1]); vst2_u8(dst0, _src); vst2_u8(dst0 - 8 * 2, _src2); src0 += 16 * 2; dst0 -= 16 * 2; } #else if (nn > 0) { asm volatile( "mov r4, #-16 \n" "0: \n" "pld [%1, #128] \n" "vld2.u8 {d0-d1}, [%1]! \n" "vrev64.u8 d0, d0 \n" "pld [%1, #128] \n" "vld2.u8 {d2-d3}, [%1]! \n" "vrev64.u8 d1, d1 \n" "vrev64.u8 d2, d2 \n" "vst2.u8 {d0-d1}, [%2], r4 \n" "vrev64.u8 d3, d3 \n" "subs %0, #1 \n" "vst2.u8 {d2-d3}, [%2], r4 \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(dst0) // %2 : "0"(nn), "1"(src0), "2"(dst0) : "cc", "memory", "q0", "q1", "r4"); } #endif // __aarch64__ dst0 += 7 * 2; #else int remain = srcw; #endif // __ARM_NEON for (; remain > 0; remain--) { dst0[0] = src0[0]; dst0[1] = src0[1]; src0 += 2; dst0 -= 2; } src0 += srcwgap; dst0 -= wgap; } } static void kanna_rotate_3_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) { const int srcwgap = srcstride - srcw * 3; const int wgap = stride - w * 3; // point to the last dst pixel unsigned char* dstend = dst + stride * h - wgap; const unsigned char* src0 = src; unsigned char* dst0 = dstend - 3; int y = 0; for (; y < srch; y++) { #if __ARM_NEON dst0 -= 7 * 3; int nn = srcw >> 4; int remain = srcw - (nn << 4); #if __aarch64__ for (; nn > 0; nn--) { uint8x8x3_t _src = vld3_u8(src0); uint8x8x3_t _src2 = vld3_u8(src0 + 8 * 3); _src.val[0] = vrev64_u8(_src.val[0]); _src.val[1] = vrev64_u8(_src.val[1]); _src.val[2] = vrev64_u8(_src.val[2]); _src2.val[0] = vrev64_u8(_src2.val[0]); _src2.val[1] = vrev64_u8(_src2.val[1]); _src2.val[2] = vrev64_u8(_src2.val[2]); vst3_u8(dst0, _src); vst3_u8(dst0 - 8 * 3, _src2); src0 += 16 * 3; dst0 -= 16 * 3; } #else if (nn > 0) { asm volatile( "mov r4, #-24 \n" "0: \n" "pld [%1, #192] \n" "vld3.u8 {d0-d2}, [%1]! \n" "vrev64.u8 d0, d0 \n" "vrev64.u8 d1, d1 \n" "pld [%1, #192] \n" "vld3.u8 {d4-d6}, [%1]! \n" "vrev64.u8 d2, d2 \n" "vrev64.u8 d4, d4 \n" "vst3.u8 {d0-d2}, [%2], r4 \n" "vrev64.u8 d5, d5 \n" "vrev64.u8 d6, d6 \n" "subs %0, #1 \n" "vst3.u8 {d4-d6}, [%2], r4 \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(dst0) // %2 : "0"(nn), "1"(src0), "2"(dst0) : "cc", "memory", "q0", "q1", "q2", "q3", "r4"); } #endif // __aarch64__ dst0 += 7 * 3; #else int remain = srcw; #endif // __ARM_NEON for (; remain > 0; remain--) { dst0[0] = src0[0]; dst0[1] = src0[1]; dst0[2] = src0[2]; src0 += 3; dst0 -= 3; } src0 += srcwgap; dst0 -= wgap; } } static void kanna_rotate_3_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) { const int srcwgap = srcstride - srcw * 4; const int wgap = stride - w * 4; // point to the last dst pixel unsigned char* dstend = dst + stride * h - wgap; const unsigned char* src0 = src; unsigned char* dst0 = dstend - 4; int y = 0; for (; y < srch; y++) { #if __ARM_NEON dst0 -= 7 * 4; int nn = srcw >> 4; int remain = srcw - (nn << 4); #if __aarch64__ for (; nn > 0; nn--) { uint8x8x4_t _src = vld4_u8(src0); uint8x8x4_t _src2 = vld4_u8(src0 + 8 * 4); _src.val[0] = vrev64_u8(_src.val[0]); _src.val[1] = vrev64_u8(_src.val[1]); _src.val[2] = vrev64_u8(_src.val[2]); _src.val[3] = vrev64_u8(_src.val[3]); _src2.val[0] = vrev64_u8(_src2.val[0]); _src2.val[1] = vrev64_u8(_src2.val[1]); _src2.val[2] = vrev64_u8(_src2.val[2]); _src2.val[3] = vrev64_u8(_src2.val[3]); vst4_u8(dst0, _src); vst4_u8(dst0 - 8 * 4, _src2); src0 += 16 * 4; dst0 -= 16 * 4; } #else if (nn > 0) { asm volatile( "mov r4, #-32 \n" "0: \n" "pld [%1, #256] \n" "vld4.u8 {d0-d3}, [%1]! \n" "vrev64.u8 d0, d0 \n" "vrev64.u8 d1, d1 \n" "vrev64.u8 d2, d2 \n" "pld [%1, #256] \n" "vld4.u8 {d4-d7}, [%1]! \n" "vrev64.u8 d3, d3 \n" "vrev64.u8 d4, d4 \n" "vrev64.u8 d5, d5 \n" "vst4.u8 {d0-d3}, [%2], r4 \n" "vrev64.u8 d6, d6 \n" "vrev64.u8 d7, d7 \n" "subs %0, #1 \n" "vst4.u8 {d4-d7}, [%2], r4 \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(dst0) // %2 : "0"(nn), "1"(src0), "2"(dst0) : "cc", "memory", "q0", "q1", "q2", "q3", "r4"); } #endif // __aarch64__ dst0 += 7 * 4; #else int remain = srcw; #endif // __ARM_NEON for (; remain > 0; remain--) { dst0[0] = src0[0]; dst0[1] = src0[1]; dst0[2] = src0[2]; dst0[3] = src0[3]; src0 += 4; dst0 -= 4; } src0 += srcwgap; dst0 -= wgap; } } static void kanna_rotate_4_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) { const int srcwgap = srcstride - srcw; const int wgap = stride + w; // point to the last dst pixel row unsigned char* dstend = dst + stride * (h - 1); const unsigned char* src0 = src; const unsigned char* src1 = src + srcstride; unsigned char* dst0 = dstend; unsigned char* dst1 = dstend - stride; int y = 0; for (; y + 1 < srch; y += 2) { #if __ARM_NEON int nn = srcw >> 5; int remain = srcw - (nn << 5); #if __aarch64__ for (; nn > 0; nn--) { uint8x16_t _src0 = vld1q_u8(src0); uint8x16_t _src0n = vld1q_u8(src0 + 16); vst1q_u8(dst0, _src0); vst1q_u8(dst0 + 16, _src0n); uint8x16_t _src1 = vld1q_u8(src1); uint8x16_t _src1n = vld1q_u8(src1 + 16); vst1q_u8(dst1, _src1); vst1q_u8(dst1 + 16, _src1n); src0 += 32; src1 += 32; dst0 += 32; dst1 += 32; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #256] \n" "vld1.u8 {d0-d3}, [%1]! \n" "pld [%2, #256] \n" "vld1.u8 {d4-d7}, [%2]! \n" "subs %0, #1 \n" "vst1.u8 {d0-d3}, [%3]! \n" "vst1.u8 {d4-d7}, [%4]! \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(src1), // %2 "=r"(dst0), // %3 "=r"(dst1) // %4 : "0"(nn), "1"(src0), "2"(src1), "3"(dst0), "4"(dst1) : "cc", "memory", "q0", "q1", "q2", "q3"); } #endif // __aarch64__ #else int remain = srcw; #endif // __ARM_NEON for (; remain > 0; remain--) { *dst0++ = *src0++; *dst1++ = *src1++; } src0 += srcwgap + srcstride; src1 += srcwgap + srcstride; dst0 -= wgap + stride; dst1 -= wgap + stride; } for (; y < srch; y++) { #if __ARM_NEON int nn = srcw >> 5; int remain = srcw - (nn << 5); #if __aarch64__ for (; nn > 0; nn--) { uint8x16_t _src = vld1q_u8(src0); uint8x16_t _src2 = vld1q_u8(src0 + 16); vst1q_u8(dst0, _src); vst1q_u8(dst0 + 16, _src2); src0 += 32; dst0 += 32; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #256] \n" "vld1.u8 {d0-d3}, [%1]! \n" "subs %0, #1 \n" "vst1.u8 {d0-d3}, [%2]! \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(dst0) // %2 : "0"(nn), "1"(src0), "2"(dst0) : "cc", "memory", "q0", "q1"); } #endif // __aarch64__ #else int remain = srcw; #endif // __ARM_NEON for (; remain > 0; remain--) { *dst0++ = *src0++; } src0 += srcwgap; dst0 -= wgap; } } static void kanna_rotate_4_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) { const int srcwgap = srcstride - srcw * 2; const int wgap = stride + w * 2; // point to the last dst pixel row unsigned char* dstend = dst + stride * (h - 1); int size = srcw * 2; const unsigned char* src0 = src; const unsigned char* src1 = src + srcstride; unsigned char* dst0 = dstend; unsigned char* dst1 = dstend - stride; int y = 0; for (; y + 1 < srch; y += 2) { #if __ARM_NEON int nn = size >> 5; int remain = size - (nn << 5); #if __aarch64__ for (; nn > 0; nn--) { uint8x16_t _src0 = vld1q_u8(src0); uint8x16_t _src0n = vld1q_u8(src0 + 16); vst1q_u8(dst0, _src0); vst1q_u8(dst0 + 16, _src0n); uint8x16_t _src1 = vld1q_u8(src1); uint8x16_t _src1n = vld1q_u8(src1 + 16); vst1q_u8(dst1, _src1); vst1q_u8(dst1 + 16, _src1n); src0 += 32; src1 += 32; dst0 += 32; dst1 += 32; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #256] \n" "vld1.u8 {d0-d3}, [%1]! \n" "pld [%2, #256] \n" "vld1.u8 {d4-d7}, [%2]! \n" "subs %0, #1 \n" "vst1.u8 {d0-d3}, [%3]! \n" "vst1.u8 {d4-d7}, [%4]! \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(src1), // %2 "=r"(dst0), // %3 "=r"(dst1) // %4 : "0"(nn), "1"(src0), "2"(src1), "3"(dst0), "4"(dst1) : "cc", "memory", "q0", "q1", "q2", "q3"); } #endif // __aarch64__ #else int remain = size; #endif // __ARM_NEON for (; remain > 0; remain--) { *dst0++ = *src0++; *dst1++ = *src1++; } src0 += srcwgap + srcstride; src1 += srcwgap + srcstride; dst0 -= wgap + stride; dst1 -= wgap + stride; } for (; y < srch; y++) { #if __ARM_NEON int nn = size >> 5; int remain = size - (nn << 5); #if __aarch64__ for (; nn > 0; nn--) { uint8x16_t _src = vld1q_u8(src0); uint8x16_t _src2 = vld1q_u8(src0 + 16); vst1q_u8(dst0, _src); vst1q_u8(dst0 + 16, _src2); src0 += 32; dst0 += 32; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #256] \n" "vld1.u8 {d0-d3}, [%1]! \n" "subs %0, #1 \n" "vst1.u8 {d0-d3}, [%2]! \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(dst0) // %2 : "0"(nn), "1"(src0), "2"(dst0) : "cc", "memory", "q0", "q1"); } #endif // __aarch64__ #else int remain = size; #endif // __ARM_NEON for (; remain > 0; remain--) { *dst0++ = *src0++; } src0 += srcwgap; dst0 -= wgap; } } static void kanna_rotate_4_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) { const int srcwgap = srcstride - srcw * 3; const int wgap = stride + w * 3; // point to the last dst pixel row unsigned char* dstend = dst + stride * (h - 1); int size = srcw * 3; const unsigned char* src0 = src; const unsigned char* src1 = src + srcstride; unsigned char* dst0 = dstend; unsigned char* dst1 = dstend - stride; int y = 0; for (; y + 1 < srch; y += 2) { #if __ARM_NEON int nn = size >> 5; int remain = size - (nn << 5); #if __aarch64__ for (; nn > 0; nn--) { uint8x16_t _src0 = vld1q_u8(src0); uint8x16_t _src0n = vld1q_u8(src0 + 16); vst1q_u8(dst0, _src0); vst1q_u8(dst0 + 16, _src0n); uint8x16_t _src1 = vld1q_u8(src1); uint8x16_t _src1n = vld1q_u8(src1 + 16); vst1q_u8(dst1, _src1); vst1q_u8(dst1 + 16, _src1n); src0 += 32; src1 += 32; dst0 += 32; dst1 += 32; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #256] \n" "vld1.u8 {d0-d3}, [%1]! \n" "pld [%2, #256] \n" "vld1.u8 {d4-d7}, [%2]! \n" "subs %0, #1 \n" "vst1.u8 {d0-d3}, [%3]! \n" "vst1.u8 {d4-d7}, [%4]! \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(src1), // %2 "=r"(dst0), // %3 "=r"(dst1) // %4 : "0"(nn), "1"(src0), "2"(src1), "3"(dst0), "4"(dst1) : "cc", "memory", "q0", "q1", "q2", "q3"); } #endif // __aarch64__ #else int remain = size; #endif // __ARM_NEON for (; remain > 0; remain--) { *dst0++ = *src0++; *dst1++ = *src1++; } src0 += srcwgap + srcstride; src1 += srcwgap + srcstride; dst0 -= wgap + stride; dst1 -= wgap + stride; } for (; y < srch; y++) { #if __ARM_NEON int nn = size >> 5; int remain = size - (nn << 5); #if __aarch64__ for (; nn > 0; nn--) { uint8x16_t _src = vld1q_u8(src0); uint8x16_t _src2 = vld1q_u8(src0 + 16); vst1q_u8(dst0, _src); vst1q_u8(dst0 + 16, _src2); src0 += 32; dst0 += 32; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #256] \n" "vld1.u8 {d0-d3}, [%1]! \n" "subs %0, #1 \n" "vst1.u8 {d0-d3}, [%2]! \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(dst0) // %2 : "0"(nn), "1"(src0), "2"(dst0) : "cc", "memory", "q0", "q1"); } #endif // __aarch64__ #else int remain = size; #endif // __ARM_NEON for (; remain > 0; remain--) { *dst0++ = *src0++; } src0 += srcwgap; dst0 -= wgap; } } static void kanna_rotate_4_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) { const int srcwgap = srcstride - srcw * 4; const int wgap = stride + w * 4; // point to the last dst pixel row unsigned char* dstend = dst + stride * (h - 1); int size = srcw * 4; const unsigned char* src0 = src; const unsigned char* src1 = src + srcstride; unsigned char* dst0 = dstend; unsigned char* dst1 = dstend - stride; int y = 0; for (; y + 1 < srch; y += 2) { #if __ARM_NEON int nn = size >> 5; int remain = size - (nn << 5); #if __aarch64__ for (; nn > 0; nn--) { uint8x16_t _src0 = vld1q_u8(src0); uint8x16_t _src0n = vld1q_u8(src0 + 16); vst1q_u8(dst0, _src0); vst1q_u8(dst0 + 16, _src0n); uint8x16_t _src1 = vld1q_u8(src1); uint8x16_t _src1n = vld1q_u8(src1 + 16); vst1q_u8(dst1, _src1); vst1q_u8(dst1 + 16, _src1n); src0 += 32; src1 += 32; dst0 += 32; dst1 += 32; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #256] \n" "vld1.u8 {d0-d3}, [%1]! \n" "pld [%2, #256] \n" "vld1.u8 {d4-d7}, [%2]! \n" "subs %0, #1 \n" "vst1.u8 {d0-d3}, [%3]! \n" "vst1.u8 {d4-d7}, [%4]! \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(src1), // %2 "=r"(dst0), // %3 "=r"(dst1) // %4 : "0"(nn), "1"(src0), "2"(src1), "3"(dst0), "4"(dst1) : "cc", "memory", "q0", "q1", "q2", "q3"); } #endif // __aarch64__ #else int remain = size; #endif // __ARM_NEON for (; remain > 0; remain--) { *dst0++ = *src0++; *dst1++ = *src1++; } src0 += srcwgap + srcstride; src1 += srcwgap + srcstride; dst0 -= wgap + stride; dst1 -= wgap + stride; } for (; y < srch; y++) { #if __ARM_NEON int nn = size >> 5; int remain = size - (nn << 5); #if __aarch64__ for (; nn > 0; nn--) { uint8x16_t _src = vld1q_u8(src0); uint8x16_t _src2 = vld1q_u8(src0 + 16); vst1q_u8(dst0, _src); vst1q_u8(dst0 + 16, _src2); src0 += 32; dst0 += 32; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #256] \n" "vld1.u8 {d0-d3}, [%1]! \n" "subs %0, #1 \n" "vst1.u8 {d0-d3}, [%2]! \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(dst0) // %2 : "0"(nn), "1"(src0), "2"(dst0) : "cc", "memory", "q0", "q1"); } #endif // __aarch64__ #else int remain = size; #endif // __ARM_NEON for (; remain > 0; remain--) { *dst0++ = *src0++; } src0 += srcwgap; dst0 -= wgap; } } static void kanna_rotate_5_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride) { const int srcwgap = srcstride - srcw; const unsigned char* src0 = src; int y = 0; #if __ARM_NEON for (; y + 7 < srch; y += 8) { const unsigned char* src1 = src0 + srcstride; unsigned char* dst0 = dst + y; unsigned char* dst1 = dst + y + stride; int src_step = 2 * srcstride; int dst_step = 2 * stride; int nn = srcw >> 3; int remain = srcw - (nn << 3); #if __aarch64__ for (; nn > 0; nn--) { uint8x8_t _src0 = vld1_u8(src0); uint8x8_t _src1 = vld1_u8(src1); uint8x8_t _src2 = vld1_u8(src0 + src_step); uint8x8_t _src3 = vld1_u8(src1 + src_step); uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step); uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step); uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step); uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step); uint8x8x2_t _src01t_r = vtrn_u8(_src0, _src1); uint8x8x2_t _src23t_r = vtrn_u8(_src2, _src3); uint8x8x2_t _src45t_r = vtrn_u8(_src4, _src5); uint8x8x2_t _src67t_r = vtrn_u8(_src6, _src7); uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0])); uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1])); uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0])); uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1])); uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0])); uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0])); uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1])); uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1])); uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[0]); uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[0]); uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[0]); uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[0]); uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[1]); uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[1]); uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[1]); uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[1]); vst1_u8(dst0, _dst0); vst1_u8(dst1, _dst1); vst1_u8(dst0 + dst_step, _dst2); vst1_u8(dst1 + dst_step, _dst3); vst1_u8(dst0 + 2 * dst_step, _dst4); vst1_u8(dst1 + 2 * dst_step, _dst5); vst1_u8(dst0 + 3 * dst_step, _dst6); vst1_u8(dst1 + 3 * dst_step, _dst7); src0 += 8; src1 += 8; dst0 += 4 * dst_step; dst1 += 4 * dst_step; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #64] \n" "vld1.u8 {d0}, [%1], %10 \n" "pld [%2, #64] \n" "vld1.u8 {d1}, [%2], %10 \n" "pld [%1, #64] \n" "vld1.u8 {d2}, [%1], %10 \n" "vtrn.u8 d0, d1 \n" // _src01t_r "pld [%2, #64] \n" "vld1.u8 {d3}, [%2], %10 \n" "pld [%1, #64] \n" "vld1.u8 {d4}, [%1], %10 \n" "vtrn.u8 d2, d3 \n" // _src23t_r "pld [%2, #64] \n" "vld1.u8 {d5}, [%2], %10 \n" "pld [%1, #64] \n" "vld1.u8 {d6}, [%1], %10 \n" "vtrn.u8 d4, d5 \n" // _src45t_r "pld [%2, #64] \n" "vld1.u8 {d7}, [%2], %10 \n" "vtrn.u8 d6, d7 \n" // _src67t_r "sub %1, %1, %10, lsl #2 \n" // restore src0 "vtrn.u16 q0, q1 \n" // _src02tt_r _src13tt_r "sub %2, %2, %10, lsl #2 \n" // restore src1 "vtrn.u16 q2, q3 \n" // _src13tt_r _src46tt_r "add %1, #8 \n" // src0 += 8 "vtrn.u32 q0, q2 \n" // _src04ttt_r _src15ttt_r "add %2, #8 \n" // src1 += 8 "vtrn.u32 q1, q3 \n" // _src26ttt_r _src37ttt_r "vst1.u8 {d0}, [%3], %11 \n" "vst1.u8 {d1}, [%4], %11 \n" "subs %0, #1 \n" "vst1.u8 {d2}, [%3], %11 \n" "vst1.u8 {d3}, [%4], %11 \n" "vst1.u8 {d4}, [%3], %11 \n" "vst1.u8 {d5}, [%4], %11 \n" "vst1.u8 {d6}, [%3], %11 \n" "vst1.u8 {d7}, [%4], %11 \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(src1), // %2 "=r"(dst0), // %3 "=r"(dst1) // %4 : "0"(nn), "1"(src0), "2"(src1), "3"(dst0), "4"(dst1), "r"(src_step), // %10 "r"(dst_step) // %11 : "cc", "memory", "q0", "q1", "q2", "q3"); } #endif // __aarch64__ for (; remain > 0; remain--) { dst0[0] = src0[0]; dst0[1] = src1[0]; dst0[2] = src0[0 + src_step]; dst0[3] = src1[0 + src_step]; dst0[4] = src0[0 + 2 * src_step]; dst0[5] = src1[0 + 2 * src_step]; dst0[6] = src0[0 + 3 * src_step]; dst0[7] = src1[0 + 3 * src_step]; src0 += 1; src1 += 1; dst0 += stride; } src0 += srcwgap + 7 * srcstride; } #endif // __ARM_NEON for (; y < srch; y++) { unsigned char* dst0 = dst + y; int x = 0; for (; x < srcw; x++) { *dst0 = *src0; src0 += 1; dst0 += stride; } src0 += srcwgap; } } static void kanna_rotate_5_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride) { const int srcwgap = srcstride - srcw * 2; const unsigned char* src0 = src; int y = 0; #if __ARM_NEON for (; y + 7 < srch; y += 8) { const unsigned char* src1 = src0 + srcstride; unsigned char* dst0 = dst + y * 2; unsigned char* dst1 = dst + y * 2 + stride; int src_step = 2 * srcstride; int dst_step = 2 * stride; int nn = srcw >> 3; int remain = srcw - (nn << 3); #if __aarch64__ for (; nn > 0; nn--) { uint8x8x2_t _src0 = vld2_u8(src0); uint8x8x2_t _src1 = vld2_u8(src1); uint8x8x2_t _src2 = vld2_u8(src0 + src_step); uint8x8x2_t _src3 = vld2_u8(src1 + src_step); uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step); uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step); uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step); uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step); uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]); uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]); uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]); uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]); uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]); uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]); uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]); uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]); uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0])); uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1])); uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0])); uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1])); uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0])); uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1])); uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0])); uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1])); uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0])); uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0])); uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1])); uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1])); uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0])); uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0])); uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1])); uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1])); uint8x8x2_t _dst0; uint8x8x2_t _dst1; uint8x8x2_t _dst2; uint8x8x2_t _dst3; uint8x8x2_t _dst4; uint8x8x2_t _dst5; uint8x8x2_t _dst6; uint8x8x2_t _dst7; _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); vst2_u8(dst0, _dst0); vst2_u8(dst1, _dst1); vst2_u8(dst0 + dst_step, _dst2); vst2_u8(dst1 + dst_step, _dst3); vst2_u8(dst0 + 2 * dst_step, _dst4); vst2_u8(dst1 + 2 * dst_step, _dst5); vst2_u8(dst0 + 3 * dst_step, _dst6); vst2_u8(dst1 + 3 * dst_step, _dst7); src0 += 2 * 8; src1 += 2 * 8; dst0 += 4 * dst_step; dst1 += 4 * dst_step; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #128] \n" "vld2.u8 {d0-d1}, [%1], %10 \n" "pld [%2, #128] \n" "vld2.u8 {d2-d3}, [%2], %10 \n" "pld [%1, #128] \n" "vld2.u8 {d4-d5}, [%1], %10 \n" "vtrn.u8 q0, q1 \n" // _src01t_r "pld [%2, #128] \n" "vld2.u8 {d6-d7}, [%2], %10 \n" "pld [%1, #128] \n" "vld2.u8 {d16-d17}, [%1], %10\n" "vtrn.u8 q2, q3 \n" // _src23t_r "pld [%2, #128] \n" "vld2.u8 {d18-d19}, [%2], %10\n" "pld [%1, #128] \n" "vld2.u8 {d20-d21}, [%1], %10\n" "vtrn.u8 q8, q9 \n" // _src45t_r "pld [%2, #128] \n" "vld2.u8 {d22-d23}, [%2], %10\n" "vtrn.u8 q10, q11 \n" // _src67t_r "sub %1, %1, %10, lsl #2 \n" // restore src0 "vtrn.u16 q0, q2 \n" // _src02tt_r "sub %2, %2, %10, lsl #2 \n" // restore src1 "vtrn.u16 q1, q3 \n" // _src13tt_r "add %1, #16 \n" // src0 += 16 "vtrn.u16 q8, q10 \n" // _src46tt_r "add %2, #16 \n" // src1 += 16 "vtrn.u16 q9, q11 \n" // _src57tt_r "vtrn.u32 q0, q8 \n" // _src04ttt_r "vtrn.u32 q1, q9 \n" // _src15ttt_r "vst2.u8 {d0-d1}, [%3], %11 \n" "vtrn.u32 q2, q10 \n" // _src26ttt_r "vst2.u8 {d2-d3}, [%4], %11 \n" "vtrn.u32 q3, q11 \n" // _src37ttt_r "vst2.u8 {d4-d5}, [%3], %11 \n" "subs %0, #1 \n" "vst2.u8 {d6-d7}, [%4], %11 \n" "vst2.u8 {d16-d17}, [%3], %11\n" "vst2.u8 {d18-d19}, [%4], %11\n" "vst2.u8 {d20-d21}, [%3], %11\n" "vst2.u8 {d22-d23}, [%4], %11\n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(src1), // %2 "=r"(dst0), // %3 "=r"(dst1) // %4 : "0"(nn), "1"(src0), "2"(src1), "3"(dst0), "4"(dst1), "r"(src_step), // %10 "r"(dst_step) // %11 : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); } #endif // __aarch64__ for (; remain > 0; remain--) { dst0[0] = src0[0]; dst0[1] = src0[1]; dst0[2] = src1[0]; dst0[3] = src1[1]; dst0[4] = src0[0 + src_step]; dst0[5] = src0[1 + src_step]; dst0[6] = src1[0 + src_step]; dst0[7] = src1[1 + src_step]; dst0[8] = src0[0 + 2 * src_step]; dst0[9] = src0[1 + 2 * src_step]; dst0[10] = src1[0 + 2 * src_step]; dst0[11] = src1[1 + 2 * src_step]; dst0[12] = src0[0 + 3 * src_step]; dst0[13] = src0[1 + 3 * src_step]; dst0[14] = src1[0 + 3 * src_step]; dst0[15] = src1[1 + 3 * src_step]; src0 += 2; src1 += 2; dst0 += stride; } src0 += srcwgap + 7 * srcstride; } #endif // __ARM_NEON for (; y < srch; y++) { unsigned char* dst0 = dst + y * 2; int x = 0; for (; x < srcw; x++) { dst0[0] = src0[0]; dst0[1] = src0[1]; src0 += 2; dst0 += stride; } src0 += srcwgap; } } static void kanna_rotate_5_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride) { const int srcwgap = srcstride - srcw * 3; const unsigned char* src0 = src; int y = 0; #if __ARM_NEON for (; y + 7 < srch; y += 8) { const unsigned char* src1 = src0 + srcstride; unsigned char* dst0 = dst + y * 3; unsigned char* dst1 = dst + y * 3 + stride; int src_step = 2 * srcstride; int dst_step = 2 * stride; int nn = srcw >> 3; int remain = srcw - (nn << 3); #if __aarch64__ for (; nn > 0; nn--) { uint8x8x3_t _src0 = vld3_u8(src0); uint8x8x3_t _src1 = vld3_u8(src1); uint8x8x3_t _src2 = vld3_u8(src0 + src_step); uint8x8x3_t _src3 = vld3_u8(src1 + src_step); uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step); uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step); uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step); uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step); uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]); uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]); uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]); uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]); uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]); uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]); uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]); uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]); uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]); uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]); uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]); uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]); uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0])); uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1])); uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0])); uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1])); uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0])); uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1])); uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0])); uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1])); uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0])); uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1])); uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0])); uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1])); uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0])); uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0])); uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1])); uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1])); uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0])); uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0])); uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1])); uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1])); uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0])); uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0])); uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1])); uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1])); uint8x8x3_t _dst0; uint8x8x3_t _dst1; uint8x8x3_t _dst2; uint8x8x3_t _dst3; uint8x8x3_t _dst4; uint8x8x3_t _dst5; uint8x8x3_t _dst6; uint8x8x3_t _dst7; _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]); _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]); _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]); _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]); _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]); _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]); _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]); _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]); vst3_u8(dst0, _dst0); vst3_u8(dst1, _dst1); vst3_u8(dst0 + dst_step, _dst2); vst3_u8(dst1 + dst_step, _dst3); vst3_u8(dst0 + 2 * dst_step, _dst4); vst3_u8(dst1 + 2 * dst_step, _dst5); vst3_u8(dst0 + 3 * dst_step, _dst6); vst3_u8(dst1 + 3 * dst_step, _dst7); src0 += 3 * 8; src1 += 3 * 8; dst0 += 4 * dst_step; dst1 += 4 * dst_step; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #192] \n" "vld3.u8 {d0-d2}, [%1], %10 \n" "pld [%2, #192] \n" "vld3.u8 {d4-d6}, [%2], %10 \n" "pld [%1, #192] \n" "vld3.u8 {d8-d10}, [%1], %10 \n" "vtrn.u8 q0, q2 \n" // _src01t_r "vtrn.u8 d2, d6 \n" "pld [%2, #192] \n" "vld3.u8 {d12-d14}, [%2], %10\n" "pld [%1, #192] \n" "vld3.u8 {d16-d18}, [%1], %10\n" "vtrn.u8 q4, q6 \n" // _src23t_r "vtrn.u8 d10, d14 \n" "pld [%2, #192] \n" "vld3.u8 {d20-d22}, [%2], %10\n" "pld [%1, #192] \n" "vld3.u8 {d24-d26}, [%1], %10\n" "vtrn.u8 q8, q10 \n" // _src45t_r "vtrn.u8 d18, d22 \n" "pld [%2, #192] \n" "vld3.u8 {d28-d30}, [%2], %10\n" "vtrn.u8 q12, q14 \n" // _src67t_r "vtrn.u8 d26, d30 \n" "sub %1, %1, %10, lsl #2 \n" // restore src0 "vtrn.u16 q0, q4 \n" // _src02tt_r "vtrn.u16 d2, d10 \n" "sub %2, %2, %10, lsl #2 \n" // restore src1 "vtrn.u16 q2, q6 \n" // _src13tt_r "vtrn.u16 d6, d14 \n" "add %1, #24 \n" // src0 += 24 "vtrn.u16 q8, q12 \n" // _src46tt_r "vtrn.u16 d18, d26 \n" "add %2, #24 \n" // src1 += 24 "vtrn.u16 q10, q14 \n" // _src57tt_r "vtrn.u16 d22, d30 \n" "vtrn.u32 q0, q8 \n" // _src04ttt_r "vtrn.u32 d2, d18 \n" "vtrn.u32 q2, q10 \n" // _src15ttt_r "vst3.u8 {d0-d2}, [%3], %11 \n" "vtrn.u32 d6, d22 \n" "vtrn.u32 q4, q12 \n" // _src26ttt_r "vst3.u8 {d4-d6}, [%4], %11 \n" "vtrn.u32 d10, d26 \n" "vtrn.u32 q6, q14 \n" // _src37ttt_r "vst3.u8 {d8-d10}, [%3], %11 \n" "vtrn.u32 d14, d30 \n" "subs %0, #1 \n" "vst3.u8 {d16-d18}, [%3], %11\n" "vst3.u8 {d12-d14}, [%4], %11\n" "vst3.u8 {d20-d22}, [%4], %11\n" "vst3.u8 {d24-d26}, [%3], %11\n" "vst3.u8 {d28-d30}, [%4], %11\n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(src1), // %2 "=r"(dst0), // %3 "=r"(dst1) // %4 : "0"(nn), "1"(src0), "2"(src1), "3"(dst0), "4"(dst1), "r"(src_step), // %10 "r"(dst_step) // %11 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); } #endif // __aarch64__ for (; remain > 0; remain--) { dst0[0] = src0[0]; dst0[1] = src0[1]; dst0[2] = src0[2]; dst0[3] = src1[0]; dst0[4] = src1[1]; dst0[5] = src1[2]; dst0[6] = src0[0 + src_step]; dst0[7] = src0[1 + src_step]; dst0[8] = src0[2 + src_step]; dst0[9] = src1[0 + src_step]; dst0[10] = src1[1 + src_step]; dst0[11] = src1[2 + src_step]; dst0[12] = src0[0 + 2 * src_step]; dst0[13] = src0[1 + 2 * src_step]; dst0[14] = src0[2 + 2 * src_step]; dst0[15] = src1[0 + 2 * src_step]; dst0[16] = src1[1 + 2 * src_step]; dst0[17] = src1[2 + 2 * src_step]; dst0[18] = src0[0 + 3 * src_step]; dst0[19] = src0[1 + 3 * src_step]; dst0[20] = src0[2 + 3 * src_step]; dst0[21] = src1[0 + 3 * src_step]; dst0[22] = src1[1 + 3 * src_step]; dst0[23] = src1[2 + 3 * src_step]; src0 += 3; src1 += 3; dst0 += stride; } src0 += srcwgap + 7 * srcstride; } #endif // __ARM_NEON for (; y < srch; y++) { unsigned char* dst0 = dst + y * 3; int x = 0; for (; x < srcw; x++) { dst0[0] = src0[0]; dst0[1] = src0[1]; dst0[2] = src0[2]; src0 += 3; dst0 += stride; } src0 += srcwgap; } } static void kanna_rotate_5_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride) { const int srcwgap = srcstride - srcw * 4; const unsigned char* src0 = src; int y = 0; #if __ARM_NEON for (; y + 7 < srch; y += 8) { const unsigned char* src1 = src0 + srcstride; unsigned char* dst0 = dst + y * 4; unsigned char* dst1 = dst + y * 4 + stride; int src_step = 2 * srcstride; int dst_step = 2 * stride; int nn = srcw >> 3; int remain = srcw - (nn << 3); #if __aarch64__ for (; nn > 0; nn--) { uint8x8x4_t _src0 = vld4_u8(src0); uint8x8x4_t _src1 = vld4_u8(src1); uint8x8x4_t _src2 = vld4_u8(src0 + src_step); uint8x8x4_t _src3 = vld4_u8(src1 + src_step); uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step); uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step); uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step); uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step); uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]); uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]); uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]); uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]); uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]); uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]); uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]); uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]); uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]); uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]); uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]); uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]); uint8x8x2_t _src01t_a = vtrn_u8(_src0.val[3], _src1.val[3]); uint8x8x2_t _src23t_a = vtrn_u8(_src2.val[3], _src3.val[3]); uint8x8x2_t _src45t_a = vtrn_u8(_src4.val[3], _src5.val[3]); uint8x8x2_t _src67t_a = vtrn_u8(_src6.val[3], _src7.val[3]); uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0])); uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1])); uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0])); uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1])); uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0])); uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1])); uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0])); uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1])); uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0])); uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1])); uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0])); uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1])); uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[0]), vreinterpret_u16_u8(_src23t_a.val[0])); uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[1]), vreinterpret_u16_u8(_src23t_a.val[1])); uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[0]), vreinterpret_u16_u8(_src67t_a.val[0])); uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[1]), vreinterpret_u16_u8(_src67t_a.val[1])); uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0])); uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0])); uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1])); uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1])); uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0])); uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0])); uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1])); uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1])); uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0])); uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0])); uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1])); uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1])); uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[0]), vreinterpret_u32_u16(_src46tt_a.val[0])); uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[0]), vreinterpret_u32_u16(_src57tt_a.val[0])); uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[1]), vreinterpret_u32_u16(_src46tt_a.val[1])); uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[1]), vreinterpret_u32_u16(_src57tt_a.val[1])); uint8x8x4_t _dst0; uint8x8x4_t _dst1; uint8x8x4_t _dst2; uint8x8x4_t _dst3; uint8x8x4_t _dst4; uint8x8x4_t _dst5; uint8x8x4_t _dst6; uint8x8x4_t _dst7; _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]); _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]); _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]); _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]); _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]); _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]); _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]); _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]); _dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]); _dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]); _dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]); _dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]); _dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]); _dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]); _dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]); _dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]); vst4_u8(dst0, _dst0); vst4_u8(dst1, _dst1); vst4_u8(dst0 + dst_step, _dst2); vst4_u8(dst1 + dst_step, _dst3); vst4_u8(dst0 + 2 * dst_step, _dst4); vst4_u8(dst1 + 2 * dst_step, _dst5); vst4_u8(dst0 + 3 * dst_step, _dst6); vst4_u8(dst1 + 3 * dst_step, _dst7); src0 += 4 * 8; src1 += 4 * 8; dst0 += 4 * dst_step; dst1 += 4 * dst_step; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #256] \n" "vld4.u8 {d0-d3}, [%1], %10 \n" "pld [%2, #256] \n" "vld4.u8 {d4-d7}, [%2], %10 \n" "pld [%1, #256] \n" "vld4.u8 {d8-d11}, [%1], %10 \n" "vtrn.u8 q0, q2 \n" // _src01t_r "vtrn.u8 q1, q3 \n" "pld [%2, #256] \n" "vld4.u8 {d12-d15}, [%2], %10\n" "pld [%1, #256] \n" "vld4.u8 {d16-d19}, [%1], %10\n" "vtrn.u8 q4, q6 \n" // _src23t_r "vtrn.u8 q5, q7 \n" "pld [%2, #256] \n" "vld4.u8 {d20-d23}, [%2], %10\n" "pld [%1, #256] \n" "vld4.u8 {d24-d27}, [%1], %10\n" "vtrn.u8 q8, q10 \n" // _src45t_r "vtrn.u8 q9, q11 \n" "pld [%2, #256] \n" "vld4.u8 {d28-d31}, [%2], %10\n" "vtrn.u8 q12, q14 \n" // _src67t_r "vtrn.u8 q13, q15 \n" "sub %1, %1, %10, lsl #2 \n" // restore src0 "vtrn.u16 q0, q4 \n" // _src02tt_r "vtrn.u16 q1, q5 \n" "sub %2, %2, %10, lsl #2 \n" // restore src1 "vtrn.u16 q2, q6 \n" // _src13tt_r "vtrn.u16 q3, q7 \n" "add %1, #32 \n" // src0 += 32 "vtrn.u16 q8, q12 \n" // _src46tt_r "vtrn.u16 q9, q13 \n" "add %2, #32 \n" // src1 += 32 "vtrn.u16 q10, q14 \n" // _src57tt_r "vtrn.u16 q11, q15 \n" "vtrn.u32 q0, q8 \n" // _src04ttt_r "vtrn.u32 q1, q9 \n" "vtrn.u32 q2, q10 \n" // _src15ttt_r "vst4.u8 {d0-d3}, [%3], %11 \n" "vtrn.u32 q3, q11 \n" "vtrn.u32 q4, q12 \n" // _src26ttt_r "vst4.u8 {d4-d7}, [%4], %11 \n" "vtrn.u32 q5, q13 \n" "vtrn.u32 q6, q14 \n" // _src37ttt_r "vst4.u8 {d8-d11}, [%3], %11 \n" "vtrn.u32 q7, q15 \n" "subs %0, #1 \n" "vst4.u8 {d16-d19}, [%3], %11\n" "vst4.u8 {d12-d15}, [%4], %11\n" "vst4.u8 {d20-d23}, [%4], %11\n" "vst4.u8 {d24-d27}, [%3], %11\n" "vst4.u8 {d28-d31}, [%4], %11\n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(src1), // %2 "=r"(dst0), // %3 "=r"(dst1) // %4 : "0"(nn), "1"(src0), "2"(src1), "3"(dst0), "4"(dst1), "r"(src_step), // %10 "r"(dst_step) // %11 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); } #endif // __aarch64__ for (; remain > 0; remain--) { dst0[0] = src0[0]; dst0[1] = src0[1]; dst0[2] = src0[2]; dst0[3] = src0[3]; dst0[4] = src1[0]; dst0[5] = src1[1]; dst0[6] = src1[2]; dst0[7] = src1[3]; dst0[8] = src0[0 + src_step]; dst0[9] = src0[1 + src_step]; dst0[10] = src0[2 + src_step]; dst0[11] = src0[3 + src_step]; dst0[12] = src1[0 + src_step]; dst0[13] = src1[1 + src_step]; dst0[14] = src1[2 + src_step]; dst0[15] = src1[3 + src_step]; dst0[16] = src0[0 + 2 * src_step]; dst0[17] = src0[1 + 2 * src_step]; dst0[18] = src0[2 + 2 * src_step]; dst0[19] = src0[3 + 2 * src_step]; dst0[20] = src1[0 + 2 * src_step]; dst0[21] = src1[1 + 2 * src_step]; dst0[22] = src1[2 + 2 * src_step]; dst0[23] = src1[3 + 2 * src_step]; dst0[24] = src0[0 + 3 * src_step]; dst0[25] = src0[1 + 3 * src_step]; dst0[26] = src0[2 + 3 * src_step]; dst0[27] = src0[3 + 3 * src_step]; dst0[28] = src1[0 + 3 * src_step]; dst0[29] = src1[1 + 3 * src_step]; dst0[30] = src1[2 + 3 * src_step]; dst0[31] = src1[3 + 3 * src_step]; src0 += 4; src1 += 4; dst0 += stride; } src0 += srcwgap + 7 * srcstride; } #endif // __ARM_NEON for (; y < srch; y++) { unsigned char* dst0 = dst + y * 4; int x = 0; for (; x < srcw; x++) { dst0[0] = src0[0]; dst0[1] = src0[1]; dst0[2] = src0[2]; dst0[3] = src0[3]; src0 += 4; dst0 += stride; } src0 += srcwgap; } } static void kanna_rotate_6_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) { const int srcwgap = srcstride - srcw; // point to the last dst pixel in row unsigned char* dstend = dst + w; const unsigned char* src0 = src; int y = 0; #if __ARM_NEON for (; y + 7 < srch; y += 8) { const unsigned char* src1 = src0 + srcstride; unsigned char* dst0 = dstend - y - 8; unsigned char* dst1 = dstend - y - 8 + stride; int src_step = 2 * srcstride; int dst_step = 2 * stride; int nn = srcw >> 3; int remain = srcw - (nn << 3); #if __aarch64__ for (; nn > 0; nn--) { uint8x8_t _src0 = vld1_u8(src0); uint8x8_t _src1 = vld1_u8(src1); uint8x8_t _src2 = vld1_u8(src0 + src_step); uint8x8_t _src3 = vld1_u8(src1 + src_step); uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step); uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step); uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step); uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step); uint8x8x2_t _src01t_r = vtrn_u8(_src1, _src0); uint8x8x2_t _src23t_r = vtrn_u8(_src3, _src2); uint8x8x2_t _src45t_r = vtrn_u8(_src5, _src4); uint8x8x2_t _src67t_r = vtrn_u8(_src7, _src6); uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1])); uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0])); uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1])); uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0])); uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1])); uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1])); uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0])); uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0])); uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[1]); uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[1]); uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[1]); uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[1]); uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[0]); uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[0]); uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[0]); uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[0]); vst1_u8(dst0, _dst7); vst1_u8(dst1, _dst6); vst1_u8(dst0 + dst_step, _dst5); vst1_u8(dst1 + dst_step, _dst4); vst1_u8(dst0 + 2 * dst_step, _dst3); vst1_u8(dst1 + 2 * dst_step, _dst2); vst1_u8(dst0 + 3 * dst_step, _dst1); vst1_u8(dst1 + 3 * dst_step, _dst0); src0 += 8; src1 += 8; dst0 += 4 * dst_step; dst1 += 4 * dst_step; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #64] \n" "vld1.u8 {d0}, [%1], %10 \n" "pld [%2, #64] \n" "vld1.u8 {d1}, [%2], %10 \n" "pld [%1, #64] \n" "vld1.u8 {d2}, [%1], %10 \n" "vtrn.u8 d1, d0 \n" // _src01t_r "pld [%2, #64] \n" "vld1.u8 {d3}, [%2], %10 \n" "pld [%1, #64] \n" "vld1.u8 {d4}, [%1], %10 \n" "vtrn.u8 d3, d2 \n" // _src23t_r "pld [%2, #64] \n" "vld1.u8 {d5}, [%2], %10 \n" "pld [%1, #64] \n" "vld1.u8 {d6}, [%1], %10 \n" "vtrn.u8 d5, d4 \n" // _src45t_r "pld [%2, #64] \n" "vld1.u8 {d7}, [%2], %10 \n" "vtrn.u8 d7, d6 \n" // _src67t_r "sub %1, %1, %10, lsl #2 \n" // restore src0 "vtrn.u16 q1, q0 \n" // _src02tt_r _src13tt_r "sub %2, %2, %10, lsl #2 \n" // restore src1 "vtrn.u16 q3, q2 \n" // _src46tt_r _src57tt_r "add %1, #8 \n" // src0 += 8 "vtrn.u32 q3, q1 \n" // _src26ttt_r _src37ttt_r "add %2, #8 \n" // src1 += 8 "vtrn.u32 q2, q0 \n" // _src04ttt_r _src15ttt_r "vst1.u8 {d6}, [%4], %11 \n" "vst1.u8 {d7}, [%3], %11 \n" "subs %0, #1 \n" "vst1.u8 {d4}, [%4], %11 \n" "vst1.u8 {d5}, [%3], %11 \n" "vst1.u8 {d2}, [%4], %11 \n" "vst1.u8 {d3}, [%3], %11 \n" "vst1.u8 {d0}, [%4], %11 \n" "vst1.u8 {d1}, [%3], %11 \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(src1), // %2 "=r"(dst0), // %3 "=r"(dst1) // %4 : "0"(nn), "1"(src0), "2"(src1), "3"(dst0), "4"(dst1), "r"(src_step), // %10 "r"(dst_step) // %11 : "cc", "memory", "q0", "q1", "q2", "q3"); } #endif // __aarch64__ for (; remain > 0; remain--) { dst0[0] = src1[0 + 3 * src_step]; dst0[1] = src0[0 + 3 * src_step]; dst0[2] = src1[0 + 2 * src_step]; dst0[3] = src0[0 + 2 * src_step]; dst0[4] = src1[0 + src_step]; dst0[5] = src0[0 + src_step]; dst0[6] = src1[0]; dst0[7] = src0[0]; src0 += 1; src1 += 1; dst0 += stride; } src0 += srcwgap + 7 * srcstride; } #endif // __ARM_NEON for (; y < srch; y++) { unsigned char* dst0 = dstend - y - 1; int x = 0; for (; x < srcw; x++) { *dst0 = *src0; src0 += 1; dst0 += stride; } src0 += srcwgap; } } static void kanna_rotate_6_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) { const int srcwgap = srcstride - srcw * 2; // point to the last dst pixel in row unsigned char* dstend = dst + w * 2; const unsigned char* src0 = src; int y = 0; #if __ARM_NEON for (; y + 7 < srch; y += 8) { const unsigned char* src1 = src0 + srcstride; unsigned char* dst0 = dstend - y * 2 - 8 * 2; unsigned char* dst1 = dstend - y * 2 - 8 * 2 + stride; int src_step = 2 * srcstride; int dst_step = 2 * stride; int nn = srcw >> 3; int remain = srcw - (nn << 3); #if __aarch64__ for (; nn > 0; nn--) { uint8x8x2_t _src0 = vld2_u8(src0); uint8x8x2_t _src1 = vld2_u8(src1); uint8x8x2_t _src2 = vld2_u8(src0 + src_step); uint8x8x2_t _src3 = vld2_u8(src1 + src_step); uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step); uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step); uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step); uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step); uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]); uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]); uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]); uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]); uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]); uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]); uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]); uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]); uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1])); uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0])); uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1])); uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0])); uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1])); uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0])); uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1])); uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0])); uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1])); uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1])); uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0])); uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0])); uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1])); uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1])); uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0])); uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0])); uint8x8x2_t _dst0; uint8x8x2_t _dst1; uint8x8x2_t _dst2; uint8x8x2_t _dst3; uint8x8x2_t _dst4; uint8x8x2_t _dst5; uint8x8x2_t _dst6; uint8x8x2_t _dst7; _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); vst2_u8(dst0, _dst7); vst2_u8(dst1, _dst6); vst2_u8(dst0 + dst_step, _dst5); vst2_u8(dst1 + dst_step, _dst4); vst2_u8(dst0 + 2 * dst_step, _dst3); vst2_u8(dst1 + 2 * dst_step, _dst2); vst2_u8(dst0 + 3 * dst_step, _dst1); vst2_u8(dst1 + 3 * dst_step, _dst0); src0 += 2 * 8; src1 += 2 * 8; dst0 += 4 * dst_step; dst1 += 4 * dst_step; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #128] \n" "vld2.u8 {d0-d1}, [%1], %10 \n" "pld [%2, #128] \n" "vld2.u8 {d2-d3}, [%2], %10 \n" "pld [%1, #128] \n" "vld2.u8 {d4-d5}, [%1], %10 \n" "vtrn.u8 q1, q0 \n" // _src01t_r "pld [%2, #128] \n" "vld2.u8 {d6-d7}, [%2], %10 \n" "pld [%1, #128] \n" "vld2.u8 {d16-d17}, [%1], %10\n" "vtrn.u8 q3, q2 \n" // _src23t_r "pld [%2, #128] \n" "vld2.u8 {d18-d19}, [%2], %10\n" "pld [%1, #128] \n" "vld2.u8 {d20-d21}, [%1], %10\n" "vtrn.u8 q9, q8 \n" // _src45t_r "pld [%2, #128] \n" "vld2.u8 {d22-d23}, [%2], %10\n" "vtrn.u8 q11, q10 \n" // _src67t_r "sub %1, %1, %10, lsl #2 \n" // restore src0 "vtrn.u16 q2, q0 \n" // _src02tt_r "sub %2, %2, %10, lsl #2 \n" // restore src1 "vtrn.u16 q3, q1 \n" // _src13tt_r "add %1, #16 \n" // src0 += 16 "vtrn.u16 q10, q8 \n" // _src46tt_r "add %2, #16 \n" // src1 += 16 "vtrn.u16 q11, q9 \n" // _src57tt_r "vtrn.u32 q10, q2 \n" // _src26ttt_r "vtrn.u32 q11, q3 \n" // _src37ttt_r "vst2.u8 {d20-d21}, [%4], %11\n" "vtrn.u32 q8, q0 \n" // _src04ttt_r "vst2.u8 {d22-d23}, [%3], %11\n" "vtrn.u32 q9, q1 \n" // _src15ttt_r "vst2.u8 {d16-d17}, [%4], %11\n" "subs %0, #1 \n" "vst2.u8 {d18-d19}, [%3], %11\n" "vst2.u8 {d4-d5}, [%4], %11 \n" "vst2.u8 {d6-d7}, [%3], %11 \n" "vst2.u8 {d0-d1}, [%4], %11 \n" "vst2.u8 {d2-d3}, [%3], %11 \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(src1), // %2 "=r"(dst0), // %3 "=r"(dst1) // %4 : "0"(nn), "1"(src0), "2"(src1), "3"(dst0), "4"(dst1), "r"(src_step), // %10 "r"(dst_step) // %11 : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); } #endif // __aarch64__ for (; remain > 0; remain--) { dst0[0] = src1[0 + 3 * src_step]; dst0[1] = src1[1 + 3 * src_step]; dst0[2] = src0[0 + 3 * src_step]; dst0[3] = src0[1 + 3 * src_step]; dst0[4] = src1[0 + 2 * src_step]; dst0[5] = src1[1 + 2 * src_step]; dst0[6] = src0[0 + 2 * src_step]; dst0[7] = src0[1 + 2 * src_step]; dst0[8] = src1[0 + src_step]; dst0[9] = src1[1 + src_step]; dst0[10] = src0[0 + src_step]; dst0[11] = src0[1 + src_step]; dst0[12] = src1[0]; dst0[13] = src1[1]; dst0[14] = src0[0]; dst0[15] = src0[1]; src0 += 2; src1 += 2; dst0 += stride; } src0 += srcwgap + 7 * srcstride; } #endif // __ARM_NEON for (; y < srch; y++) { unsigned char* dst0 = dstend - y * 2 - 2; int x = 0; for (; x < srcw; x++) { dst0[0] = src0[0]; dst0[1] = src0[1]; src0 += 2; dst0 += stride; } src0 += srcwgap; } } static void kanna_rotate_6_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) { const int srcwgap = srcstride - srcw * 3; // point to the last dst pixel in row unsigned char* dstend = dst + w * 3; const unsigned char* src0 = src; int y = 0; #if __ARM_NEON for (; y + 7 < srch; y += 8) { const unsigned char* src1 = src0 + srcstride; unsigned char* dst0 = dstend - y * 3 - 8 * 3; unsigned char* dst1 = dstend - y * 3 - 8 * 3 + stride; int src_step = 2 * srcstride; int dst_step = 2 * stride; int nn = srcw >> 3; int remain = srcw - (nn << 3); #if __aarch64__ for (; nn > 0; nn--) { uint8x8x3_t _src0 = vld3_u8(src0); uint8x8x3_t _src1 = vld3_u8(src1); uint8x8x3_t _src2 = vld3_u8(src0 + src_step); uint8x8x3_t _src3 = vld3_u8(src1 + src_step); uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step); uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step); uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step); uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step); uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]); uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]); uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]); uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]); uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]); uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]); uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]); uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]); uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]); uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]); uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]); uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]); uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1])); uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0])); uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1])); uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0])); uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1])); uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0])); uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1])); uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0])); uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1])); uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0])); uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1])); uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0])); uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1])); uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1])); uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0])); uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0])); uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1])); uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1])); uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0])); uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0])); uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1])); uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1])); uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0])); uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0])); uint8x8x3_t _dst0; uint8x8x3_t _dst1; uint8x8x3_t _dst2; uint8x8x3_t _dst3; uint8x8x3_t _dst4; uint8x8x3_t _dst5; uint8x8x3_t _dst6; uint8x8x3_t _dst7; _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]); _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]); _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]); _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]); _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]); _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]); _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]); _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]); vst3_u8(dst0, _dst7); vst3_u8(dst1, _dst6); vst3_u8(dst0 + dst_step, _dst5); vst3_u8(dst1 + dst_step, _dst4); vst3_u8(dst0 + 2 * dst_step, _dst3); vst3_u8(dst1 + 2 * dst_step, _dst2); vst3_u8(dst0 + 3 * dst_step, _dst1); vst3_u8(dst1 + 3 * dst_step, _dst0); src0 += 3 * 8; src1 += 3 * 8; dst0 += 4 * dst_step; dst1 += 4 * dst_step; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #192] \n" "vld3.u8 {d0-d2}, [%1], %10 \n" "pld [%2, #192] \n" "vld3.u8 {d4-d6}, [%2], %10 \n" "pld [%1, #192] \n" "vld3.u8 {d8-d10}, [%1], %10 \n" "vtrn.u8 q2, q0 \n" // _src01t_r "vtrn.u8 d6, d2 \n" "pld [%2, #192] \n" "vld3.u8 {d12-d14}, [%2], %10\n" "pld [%1, #192] \n" "vld3.u8 {d16-d18}, [%1], %10\n" "vtrn.u8 q6, q4 \n" // _src23t_r "vtrn.u8 d14, d10 \n" "pld [%2, #192] \n" "vld3.u8 {d20-d22}, [%2], %10\n" "pld [%1, #192] \n" "vld3.u8 {d24-d26}, [%1], %10\n" "vtrn.u8 q10, q8 \n" // _src45t_r "vtrn.u8 d22, d18 \n" "pld [%2, #192] \n" "vld3.u8 {d28-d30}, [%2], %10\n" "vtrn.u8 q14, q12 \n" // _src67t_r "vtrn.u8 d30, d26 \n" "sub %1, %1, %10, lsl #2 \n" // restore src0 "vtrn.u16 q4, q0 \n" // _src02tt_r "vtrn.u16 d10, d2 \n" "sub %2, %2, %10, lsl #2 \n" // restore src1 "vtrn.u16 q6, q2 \n" // _src13tt_r "vtrn.u16 d14, d6 \n" "add %1, #24 \n" // src0 += 24 "vtrn.u16 q12, q8 \n" // _src46tt_r "vtrn.u16 d26, d18 \n" "add %2, #24 \n" // src1 += 24 "vtrn.u16 q14, q10 \n" // _src57tt_r "vtrn.u16 d30, d22 \n" "vtrn.u32 q12, q4 \n" // _src26ttt_r "vtrn.u32 d26, d10 \n" "vtrn.u32 q14, q6 \n" // _src37ttt_r "vst3.u8 {d24-d26}, [%4], %11\n" "vtrn.u32 d30, d14 \n" "vtrn.u32 q8, q0 \n" // _src04ttt_r "vst3.u8 {d28-d30}, [%3], %11\n" "vtrn.u32 d18, d2 \n" "vtrn.u32 q10, q2 \n" // _src15ttt_r "vst3.u8 {d16-d18}, [%4], %11\n" "vtrn.u32 d22, d6 \n" "subs %0, #1 \n" "vst3.u8 {d20-d22}, [%3], %11\n" "vst3.u8 {d8-d10}, [%4], %11 \n" "vst3.u8 {d12-d14}, [%3], %11\n" "vst3.u8 {d0-d2}, [%4], %11 \n" "vst3.u8 {d4-d6}, [%3], %11 \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(src1), // %2 "=r"(dst0), // %3 "=r"(dst1) // %4 : "0"(nn), "1"(src0), "2"(src1), "3"(dst0), "4"(dst1), "r"(src_step), // %10 "r"(dst_step) // %11 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); } #endif // __aarch64__ for (; remain > 0; remain--) { dst0[0] = src1[0 + 3 * src_step]; dst0[1] = src1[1 + 3 * src_step]; dst0[2] = src1[2 + 3 * src_step]; dst0[3] = src0[0 + 3 * src_step]; dst0[4] = src0[1 + 3 * src_step]; dst0[5] = src0[2 + 3 * src_step]; dst0[6] = src1[0 + 2 * src_step]; dst0[7] = src1[1 + 2 * src_step]; dst0[8] = src1[2 + 2 * src_step]; dst0[9] = src0[0 + 2 * src_step]; dst0[10] = src0[1 + 2 * src_step]; dst0[11] = src0[2 + 2 * src_step]; dst0[12] = src1[0 + src_step]; dst0[13] = src1[1 + src_step]; dst0[14] = src1[2 + src_step]; dst0[15] = src0[0 + src_step]; dst0[16] = src0[1 + src_step]; dst0[17] = src0[2 + src_step]; dst0[18] = src1[0]; dst0[19] = src1[1]; dst0[20] = src1[2]; dst0[21] = src0[0]; dst0[22] = src0[1]; dst0[23] = src0[2]; src0 += 3; src1 += 3; dst0 += stride; } src0 += srcwgap + 7 * srcstride; } #endif // __ARM_NEON for (; y < srch; y++) { unsigned char* dst0 = dstend - y * 3 - 3; int x = 0; for (; x < srcw; x++) { dst0[0] = src0[0]; dst0[1] = src0[1]; dst0[2] = src0[2]; src0 += 3; dst0 += stride; } src0 += srcwgap; } } static void kanna_rotate_6_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) { const int srcwgap = srcstride - srcw * 4; // point to the last dst pixel in row unsigned char* dstend = dst + w * 4; const unsigned char* src0 = src; int y = 0; #if __ARM_NEON for (; y + 7 < srch; y += 8) { const unsigned char* src1 = src0 + srcstride; unsigned char* dst0 = dstend - y * 4 - 8 * 4; unsigned char* dst1 = dstend - y * 4 - 8 * 4 + stride; int src_step = 2 * srcstride; int dst_step = 2 * stride; int nn = srcw >> 3; int remain = srcw - (nn << 3); #if __aarch64__ for (; nn > 0; nn--) { uint8x8x4_t _src0 = vld4_u8(src0); uint8x8x4_t _src1 = vld4_u8(src1); uint8x8x4_t _src2 = vld4_u8(src0 + src_step); uint8x8x4_t _src3 = vld4_u8(src1 + src_step); uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step); uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step); uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step); uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step); uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]); uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]); uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]); uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]); uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]); uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]); uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]); uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]); uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]); uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]); uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]); uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]); uint8x8x2_t _src01t_a = vtrn_u8(_src1.val[3], _src0.val[3]); uint8x8x2_t _src23t_a = vtrn_u8(_src3.val[3], _src2.val[3]); uint8x8x2_t _src45t_a = vtrn_u8(_src5.val[3], _src4.val[3]); uint8x8x2_t _src67t_a = vtrn_u8(_src7.val[3], _src6.val[3]); uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1])); uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0])); uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1])); uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0])); uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1])); uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0])); uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1])); uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0])); uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1])); uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0])); uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1])); uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0])); uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[1]), vreinterpret_u16_u8(_src01t_a.val[1])); uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[0]), vreinterpret_u16_u8(_src01t_a.val[0])); uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[1]), vreinterpret_u16_u8(_src45t_a.val[1])); uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[0]), vreinterpret_u16_u8(_src45t_a.val[0])); uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1])); uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1])); uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0])); uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0])); uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1])); uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1])); uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0])); uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0])); uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1])); uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1])); uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0])); uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0])); uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[1]), vreinterpret_u32_u16(_src02tt_a.val[1])); uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[1]), vreinterpret_u32_u16(_src13tt_a.val[1])); uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[0]), vreinterpret_u32_u16(_src02tt_a.val[0])); uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[0]), vreinterpret_u32_u16(_src13tt_a.val[0])); uint8x8x4_t _dst0; uint8x8x4_t _dst1; uint8x8x4_t _dst2; uint8x8x4_t _dst3; uint8x8x4_t _dst4; uint8x8x4_t _dst5; uint8x8x4_t _dst6; uint8x8x4_t _dst7; _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]); _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]); _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]); _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]); _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]); _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]); _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]); _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]); _dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]); _dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]); _dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]); _dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]); _dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]); _dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]); _dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]); _dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]); vst4_u8(dst0, _dst7); vst4_u8(dst1, _dst6); vst4_u8(dst0 + dst_step, _dst5); vst4_u8(dst1 + dst_step, _dst4); vst4_u8(dst0 + 2 * dst_step, _dst3); vst4_u8(dst1 + 2 * dst_step, _dst2); vst4_u8(dst0 + 3 * dst_step, _dst1); vst4_u8(dst1 + 3 * dst_step, _dst0); src0 += 4 * 8; src1 += 4 * 8; dst0 += 4 * dst_step; dst1 += 4 * dst_step; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #256] \n" "vld4.u8 {d0-d3}, [%1], %10 \n" "pld [%2, #256] \n" "vld4.u8 {d4-d7}, [%2], %10 \n" "pld [%1, #256] \n" "vld4.u8 {d8-d11}, [%1], %10 \n" "vtrn.u8 q2, q0 \n" // _src01t_r "vtrn.u8 q3, q1 \n" "pld [%2, #256] \n" "vld4.u8 {d12-d15}, [%2], %10\n" "pld [%1, #256] \n" "vld4.u8 {d16-d19}, [%1], %10\n" "vtrn.u8 q6, q4 \n" // _src23t_r "vtrn.u8 q7, q5 \n" "pld [%2, #256] \n" "vld4.u8 {d20-d23}, [%2], %10\n" "pld [%1, #256] \n" "vld4.u8 {d24-d27}, [%1], %10\n" "vtrn.u8 q10, q8 \n" // _src45t_r "vtrn.u8 q11, q9 \n" "pld [%2, #256] \n" "vld4.u8 {d28-d31}, [%2], %10\n" "vtrn.u8 q14, q12 \n" // _src67t_r "vtrn.u8 q15, q13 \n" "sub %1, %1, %10, lsl #2 \n" // restore src0 "vtrn.u16 q4, q0 \n" // _src02tt_r "vtrn.u16 q5, q1 \n" "sub %2, %2, %10, lsl #2 \n" // restore src1 "vtrn.u16 q6, q2 \n" // _src13tt_r "vtrn.u16 q7, q3 \n" "add %1, #32 \n" // src0 += 32 "vtrn.u16 q12, q8 \n" // _src46tt_r "vtrn.u16 q13, q9 \n" "add %2, #32 \n" // src1 += 32 "vtrn.u16 q14, q10 \n" // _src57tt_r "vtrn.u16 q15, q11 \n" "vtrn.u32 q12, q4 \n" // _src26ttt_r "vtrn.u32 q13, q5 \n" "vtrn.u32 q14, q6 \n" // _src37ttt_r "vst4.u8 {d24-d27}, [%4], %11\n" "vtrn.u32 q15, q7 \n" "vtrn.u32 q8, q0 \n" // _src04ttt_r "vst4.u8 {d28-d31}, [%3], %11\n" "vtrn.u32 q9, q1 \n" "vtrn.u32 q10, q2 \n" // _src15ttt_r "vst4.u8 {d16-d19}, [%4], %11\n" "vtrn.u32 q11, q3 \n" "subs %0, #1 \n" "vst4.u8 {d8-d11}, [%4], %11 \n" "vst4.u8 {d20-d23}, [%3], %11\n" "vst4.u8 {d12-d15}, [%3], %11\n" "vst4.u8 {d0-d3}, [%4], %11 \n" "vst4.u8 {d4-d7}, [%3], %11 \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(src1), // %2 "=r"(dst0), // %3 "=r"(dst1) // %4 : "0"(nn), "1"(src0), "2"(src1), "3"(dst0), "4"(dst1), "r"(src_step), // %10 "r"(dst_step) // %11 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); } #endif // __aarch64__ for (; remain > 0; remain--) { dst0[0] = src1[0 + 3 * src_step]; dst0[1] = src1[1 + 3 * src_step]; dst0[2] = src1[2 + 3 * src_step]; dst0[3] = src1[3 + 3 * src_step]; dst0[4] = src0[0 + 3 * src_step]; dst0[5] = src0[1 + 3 * src_step]; dst0[6] = src0[2 + 3 * src_step]; dst0[7] = src0[3 + 3 * src_step]; dst0[8] = src1[0 + 2 * src_step]; dst0[9] = src1[1 + 2 * src_step]; dst0[10] = src1[2 + 2 * src_step]; dst0[11] = src1[3 + 2 * src_step]; dst0[12] = src0[0 + 2 * src_step]; dst0[13] = src0[1 + 2 * src_step]; dst0[14] = src0[2 + 2 * src_step]; dst0[15] = src0[3 + 2 * src_step]; dst0[16] = src1[0 + src_step]; dst0[17] = src1[1 + src_step]; dst0[18] = src1[2 + src_step]; dst0[19] = src1[3 + src_step]; dst0[20] = src0[0 + src_step]; dst0[21] = src0[1 + src_step]; dst0[22] = src0[2 + src_step]; dst0[23] = src0[3 + src_step]; dst0[24] = src1[0]; dst0[25] = src1[1]; dst0[26] = src1[2]; dst0[27] = src1[3]; dst0[28] = src0[0]; dst0[29] = src0[1]; dst0[30] = src0[2]; dst0[31] = src0[3]; src0 += 4; src1 += 4; dst0 += stride; } src0 += srcwgap + 7 * srcstride; } #endif // __ARM_NEON for (; y < srch; y++) { unsigned char* dst0 = dstend - y * 4 - 4; int x = 0; for (; x < srcw; x++) { dst0[0] = src0[0]; dst0[1] = src0[1]; dst0[2] = src0[2]; dst0[3] = src0[3]; src0 += 4; dst0 += stride; } src0 += srcwgap; } } static void kanna_rotate_7_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) { const int srcwgap = srcstride - srcw; // point to the last dst pixel unsigned char* dstend = dst + stride * (h - 1) + w; const unsigned char* src0 = src; int y = 0; #if __ARM_NEON for (; y + 7 < srch; y += 8) { const unsigned char* src1 = src0 + srcstride; unsigned char* dst6 = dstend - y - 8 - stride; unsigned char* dst7 = dstend - y - 8; int src_step = 2 * srcstride; int dst_step = -2 * stride; int nn = srcw >> 3; int remain = srcw - (nn << 3); #if __aarch64__ for (; nn > 0; nn--) { uint8x8_t _src0 = vld1_u8(src0); uint8x8_t _src1 = vld1_u8(src1); uint8x8_t _src2 = vld1_u8(src0 + src_step); uint8x8_t _src3 = vld1_u8(src1 + src_step); uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step); uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step); uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step); uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step); uint8x8x2_t _src01t_r = vtrn_u8(_src1, _src0); uint8x8x2_t _src23t_r = vtrn_u8(_src3, _src2); uint8x8x2_t _src45t_r = vtrn_u8(_src5, _src4); uint8x8x2_t _src67t_r = vtrn_u8(_src7, _src6); uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1])); uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0])); uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1])); uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0])); uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1])); uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1])); uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0])); uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0])); uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[1]); uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[1]); uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[1]); uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[1]); uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[0]); uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[0]); uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[0]); uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[0]); vst1_u8(dst7, _dst7); vst1_u8(dst6, _dst6); vst1_u8(dst7 + dst_step, _dst5); vst1_u8(dst6 + dst_step, _dst4); vst1_u8(dst7 + 2 * dst_step, _dst3); vst1_u8(dst6 + 2 * dst_step, _dst2); vst1_u8(dst7 + 3 * dst_step, _dst1); vst1_u8(dst6 + 3 * dst_step, _dst0); src0 += 8; src1 += 8; dst7 += 4 * dst_step; dst6 += 4 * dst_step; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #64] \n" "vld1.u8 {d0}, [%1], %10 \n" "pld [%2, #64] \n" "vld1.u8 {d1}, [%2], %10 \n" "pld [%1, #64] \n" "vld1.u8 {d2}, [%1], %10 \n" "vtrn.u8 d1, d0 \n" // _src01t_r "pld [%2, #64] \n" "vld1.u8 {d3}, [%2], %10 \n" "pld [%1, #64] \n" "vld1.u8 {d4}, [%1], %10 \n" "vtrn.u8 d3, d2 \n" // _src23t_r "pld [%2, #64] \n" "vld1.u8 {d5}, [%2], %10 \n" "pld [%1, #64] \n" "vld1.u8 {d6}, [%1], %10 \n" "vtrn.u8 d5, d4 \n" // _src45t_r "pld [%2, #64] \n" "vld1.u8 {d7}, [%2], %10 \n" "vtrn.u8 d7, d6 \n" // _src67t_r "sub %1, %1, %10, lsl #2 \n" // restore src0 "vtrn.u16 q1, q0 \n" // _src02tt_r _src13tt_r "sub %2, %2, %10, lsl #2 \n" // restore src1 "vtrn.u16 q3, q2 \n" // _src46tt_r _src57tt_r "add %1, #8 \n" // src0 += 8 "vtrn.u32 q3, q1 \n" // _src26ttt_r _src37ttt_r "add %2, #8 \n" // src1 += 8 "vtrn.u32 q2, q0 \n" // _src04ttt_r _src15ttt_r "vst1.u8 {d6}, [%4], %11 \n" "vst1.u8 {d7}, [%3], %11 \n" "subs %0, #1 \n" "vst1.u8 {d4}, [%4], %11 \n" "vst1.u8 {d5}, [%3], %11 \n" "vst1.u8 {d2}, [%4], %11 \n" "vst1.u8 {d3}, [%3], %11 \n" "vst1.u8 {d0}, [%4], %11 \n" "vst1.u8 {d1}, [%3], %11 \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(src1), // %2 "=r"(dst7), // %3 "=r"(dst6) // %4 : "0"(nn), "1"(src0), "2"(src1), "3"(dst7), "4"(dst6), "r"(src_step), // %10 "r"(dst_step) // %11 : "cc", "memory", "q0", "q1", "q2", "q3"); } #endif // __aarch64__ for (; remain > 0; remain--) { dst7[0] = src1[0 + 3 * src_step]; dst7[1] = src0[0 + 3 * src_step]; dst7[2] = src1[0 + 2 * src_step]; dst7[3] = src0[0 + 2 * src_step]; dst7[4] = src1[0 + src_step]; dst7[5] = src0[0 + src_step]; dst7[6] = src1[0]; dst7[7] = src0[0]; src0 += 1; src1 += 1; dst7 -= stride; } src0 += srcwgap + 7 * srcstride; } #endif // __ARM_NEON for (; y < srch; y++) { unsigned char* dst0 = dstend - y - 1; int x = 0; for (; x < srcw; x++) { *dst0 = *src0; src0 += 1; dst0 -= stride; } src0 += srcwgap; } } static void kanna_rotate_7_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) { const int srcwgap = srcstride - srcw * 2; // point to the last dst pixel unsigned char* dstend = dst + stride * (h - 1) + w * 2; const unsigned char* src0 = src; int y = 0; #if __ARM_NEON for (; y + 7 < srch; y += 8) { const unsigned char* src1 = src0 + srcstride; unsigned char* dst6 = dstend - y * 2 - 8 * 2 - stride; unsigned char* dst7 = dstend - y * 2 - 8 * 2; int src_step = 2 * srcstride; int dst_step = -2 * stride; int nn = srcw >> 3; int remain = srcw - (nn << 3); #if __aarch64__ for (; nn > 0; nn--) { uint8x8x2_t _src0 = vld2_u8(src0); uint8x8x2_t _src1 = vld2_u8(src1); uint8x8x2_t _src2 = vld2_u8(src0 + src_step); uint8x8x2_t _src3 = vld2_u8(src1 + src_step); uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step); uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step); uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step); uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step); uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]); uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]); uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]); uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]); uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]); uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]); uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]); uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]); uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1])); uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0])); uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1])); uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0])); uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1])); uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0])); uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1])); uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0])); uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1])); uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1])); uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0])); uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0])); uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1])); uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1])); uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0])); uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0])); uint8x8x2_t _dst0; uint8x8x2_t _dst1; uint8x8x2_t _dst2; uint8x8x2_t _dst3; uint8x8x2_t _dst4; uint8x8x2_t _dst5; uint8x8x2_t _dst6; uint8x8x2_t _dst7; _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); vst2_u8(dst7, _dst7); vst2_u8(dst6, _dst6); vst2_u8(dst7 + dst_step, _dst5); vst2_u8(dst6 + dst_step, _dst4); vst2_u8(dst7 + 2 * dst_step, _dst3); vst2_u8(dst6 + 2 * dst_step, _dst2); vst2_u8(dst7 + 3 * dst_step, _dst1); vst2_u8(dst6 + 3 * dst_step, _dst0); src0 += 2 * 8; src1 += 2 * 8; dst7 += 4 * dst_step; dst6 += 4 * dst_step; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #128] \n" "vld2.u8 {d0-d1}, [%1], %10 \n" "pld [%2, #128] \n" "vld2.u8 {d2-d3}, [%2], %10 \n" "pld [%1, #128] \n" "vld2.u8 {d4-d5}, [%1], %10 \n" "vtrn.u8 q1, q0 \n" // _src01t_r "pld [%2, #128] \n" "vld2.u8 {d6-d7}, [%2], %10 \n" "pld [%1, #128] \n" "vld2.u8 {d16-d17}, [%1], %10\n" "vtrn.u8 q3, q2 \n" // _src23t_r "pld [%2, #128] \n" "vld2.u8 {d18-d19}, [%2], %10\n" "pld [%1, #128] \n" "vld2.u8 {d20-d21}, [%1], %10\n" "vtrn.u8 q9, q8 \n" // _src45t_r "pld [%2, #128] \n" "vld2.u8 {d22-d23}, [%2], %10\n" "vtrn.u8 q11, q10 \n" // _src67t_r "sub %1, %1, %10, lsl #2 \n" // restore src0 "vtrn.u16 q2, q0 \n" // _src02tt_r "sub %2, %2, %10, lsl #2 \n" // restore src1 "vtrn.u16 q3, q1 \n" // _src13tt_r "add %1, #16 \n" // src0 += 16 "vtrn.u16 q10, q8 \n" // _src46tt_r "add %2, #16 \n" // src1 += 16 "vtrn.u16 q11, q9 \n" // _src57tt_r "vtrn.u32 q10, q2 \n" // _src26ttt_r "vtrn.u32 q11, q3 \n" // _src37ttt_r "vst2.u8 {d20-d21}, [%4], %11\n" "vtrn.u32 q8, q0 \n" // _src04ttt_r "vst2.u8 {d22-d23}, [%3], %11\n" "vtrn.u32 q9, q1 \n" // _src15ttt_r "vst2.u8 {d16-d17}, [%4], %11\n" "subs %0, #1 \n" "vst2.u8 {d4-d5}, [%4], %11 \n" "vst2.u8 {d18-d19}, [%3], %11\n" "vst2.u8 {d6-d7}, [%3], %11 \n" "vst2.u8 {d0-d1}, [%4], %11 \n" "vst2.u8 {d2-d3}, [%3], %11 \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(src1), // %2 "=r"(dst7), // %3 "=r"(dst6) // %4 : "0"(nn), "1"(src0), "2"(src1), "3"(dst7), "4"(dst6), "r"(src_step), // %10 "r"(dst_step) // %11 : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); } #endif // __aarch64__ for (; remain > 0; remain--) { dst7[0] = src1[0 + 3 * src_step]; dst7[1] = src1[1 + 3 * src_step]; dst7[2] = src0[0 + 3 * src_step]; dst7[3] = src0[1 + 3 * src_step]; dst7[4] = src1[0 + 2 * src_step]; dst7[5] = src1[1 + 2 * src_step]; dst7[6] = src0[0 + 2 * src_step]; dst7[7] = src0[1 + 2 * src_step]; dst7[8] = src1[0 + src_step]; dst7[9] = src1[1 + src_step]; dst7[10] = src0[0 + src_step]; dst7[11] = src0[1 + src_step]; dst7[12] = src1[0]; dst7[13] = src1[1]; dst7[14] = src0[0]; dst7[15] = src0[1]; src0 += 2; src1 += 2; dst7 -= stride; } src0 += srcwgap + 7 * srcstride; } #endif // __ARM_NEON for (; y < srch; y++) { unsigned char* dst0 = dstend - y * 2 - 2; int x = 0; for (; x < srcw; x++) { dst0[0] = src0[0]; dst0[1] = src0[1]; src0 += 2; dst0 -= stride; } src0 += srcwgap; } } static void kanna_rotate_7_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) { const int srcwgap = srcstride - srcw * 3; // point to the last dst pixel unsigned char* dstend = dst + stride * (h - 1) + w * 3; const unsigned char* src0 = src; int y = 0; #if __ARM_NEON for (; y + 7 < srch; y += 8) { const unsigned char* src1 = src0 + srcstride; unsigned char* dst6 = dstend - y * 3 - 8 * 3 - stride; unsigned char* dst7 = dstend - y * 3 - 8 * 3; int src_step = 2 * srcstride; int dst_step = -2 * stride; int nn = srcw >> 3; int remain = srcw - (nn << 3); #if __aarch64__ for (; nn > 0; nn--) { uint8x8x3_t _src0 = vld3_u8(src0); uint8x8x3_t _src1 = vld3_u8(src1); uint8x8x3_t _src2 = vld3_u8(src0 + src_step); uint8x8x3_t _src3 = vld3_u8(src1 + src_step); uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step); uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step); uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step); uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step); uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]); uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]); uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]); uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]); uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]); uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]); uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]); uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]); uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]); uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]); uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]); uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]); uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1])); uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0])); uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1])); uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0])); uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1])); uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0])); uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1])); uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0])); uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1])); uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0])); uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1])); uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0])); uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1])); uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1])); uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0])); uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0])); uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1])); uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1])); uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0])); uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0])); uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1])); uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1])); uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0])); uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0])); uint8x8x3_t _dst0; uint8x8x3_t _dst1; uint8x8x3_t _dst2; uint8x8x3_t _dst3; uint8x8x3_t _dst4; uint8x8x3_t _dst5; uint8x8x3_t _dst6; uint8x8x3_t _dst7; _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]); _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]); _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]); _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]); _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]); _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]); _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]); _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]); vst3_u8(dst7, _dst7); vst3_u8(dst6, _dst6); vst3_u8(dst7 + dst_step, _dst5); vst3_u8(dst6 + dst_step, _dst4); vst3_u8(dst7 + 2 * dst_step, _dst3); vst3_u8(dst6 + 2 * dst_step, _dst2); vst3_u8(dst7 + 3 * dst_step, _dst1); vst3_u8(dst6 + 3 * dst_step, _dst0); src0 += 3 * 8; src1 += 3 * 8; dst7 += 4 * dst_step; dst6 += 4 * dst_step; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #192] \n" "vld3.u8 {d0-d2}, [%1], %10 \n" "pld [%2, #192] \n" "vld3.u8 {d4-d6}, [%2], %10 \n" "pld [%1, #192] \n" "vld3.u8 {d8-d10}, [%1], %10 \n" "vtrn.u8 q2, q0 \n" // _src01t_r "vtrn.u8 d6, d2 \n" "pld [%2, #192] \n" "vld3.u8 {d12-d14}, [%2], %10\n" "pld [%1, #192] \n" "vld3.u8 {d16-d18}, [%1], %10\n" "vtrn.u8 q6, q4 \n" // _src23t_r "vtrn.u8 d14, d10 \n" "pld [%2, #192] \n" "vld3.u8 {d20-d22}, [%2], %10\n" "pld [%1, #192] \n" "vld3.u8 {d24-d26}, [%1], %10\n" "vtrn.u8 q10, q8 \n" // _src45t_r "vtrn.u8 d22, d18 \n" "pld [%2, #192] \n" "vld3.u8 {d28-d30}, [%2], %10\n" "vtrn.u8 q14, q12 \n" // _src67t_r "vtrn.u8 d30, d26 \n" "sub %1, %1, %10, lsl #2 \n" // restore src0 "vtrn.u16 q4, q0 \n" // _src02tt_r "vtrn.u16 d10, d2 \n" "sub %2, %2, %10, lsl #2 \n" // restore src1 "vtrn.u16 q6, q2 \n" // _src13tt_r "vtrn.u16 d14, d6 \n" "add %1, #24 \n" // src0 += 24 "vtrn.u16 q12, q8 \n" // _src46tt_r "vtrn.u16 d26, d18 \n" "add %2, #24 \n" // src1 += 24 "vtrn.u16 q14, q10 \n" // _src57tt_r "vtrn.u16 d30, d22 \n" "vtrn.u32 q12, q4 \n" // _src26ttt_r "vtrn.u32 d26, d10 \n" "vtrn.u32 q14, q6 \n" // _src37ttt_r "vst3.u8 {d24-d26}, [%4], %11\n" "vtrn.u32 d30, d14 \n" "vtrn.u32 q8, q0 \n" // _src04ttt_r "vst3.u8 {d28-d30}, [%3], %11\n" "vtrn.u32 d18, d2 \n" "vtrn.u32 q10, q2 \n" // _src15ttt_r "vst3.u8 {d16-d18}, [%4], %11\n" "vtrn.u32 d22, d6 \n" "subs %0, #1 \n" "vst3.u8 {d8-d10}, [%4], %11 \n" "vst3.u8 {d20-d22}, [%3], %11\n" "vst3.u8 {d12-d14}, [%3], %11\n" "vst3.u8 {d0-d2}, [%4], %11 \n" "vst3.u8 {d4-d6}, [%3], %11 \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(src1), // %2 "=r"(dst7), // %3 "=r"(dst6) // %4 : "0"(nn), "1"(src0), "2"(src1), "3"(dst7), "4"(dst6), "r"(src_step), // %10 "r"(dst_step) // %11 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); } #endif // __aarch64__ for (; remain > 0; remain--) { dst7[0] = src1[0 + 3 * src_step]; dst7[1] = src1[1 + 3 * src_step]; dst7[2] = src1[2 + 3 * src_step]; dst7[3] = src0[0 + 3 * src_step]; dst7[4] = src0[1 + 3 * src_step]; dst7[5] = src0[2 + 3 * src_step]; dst7[6] = src1[0 + 2 * src_step]; dst7[7] = src1[1 + 2 * src_step]; dst7[8] = src1[2 + 2 * src_step]; dst7[9] = src0[0 + 2 * src_step]; dst7[10] = src0[1 + 2 * src_step]; dst7[11] = src0[2 + 2 * src_step]; dst7[12] = src1[0 + src_step]; dst7[13] = src1[1 + src_step]; dst7[14] = src1[2 + src_step]; dst7[15] = src0[0 + src_step]; dst7[16] = src0[1 + src_step]; dst7[17] = src0[2 + src_step]; dst7[18] = src1[0]; dst7[19] = src1[1]; dst7[20] = src1[2]; dst7[21] = src0[0]; dst7[22] = src0[1]; dst7[23] = src0[2]; src0 += 3; src1 += 3; dst7 -= stride; } src0 += srcwgap + 7 * srcstride; } #endif // __ARM_NEON for (; y < srch; y++) { unsigned char* dst0 = dstend - y * 3 - 3; int x = 0; for (; x < srcw; x++) { dst0[0] = src0[0]; dst0[1] = src0[1]; dst0[2] = src0[2]; src0 += 3; dst0 -= stride; } src0 += srcwgap; } } static void kanna_rotate_7_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) { const int srcwgap = srcstride - srcw * 4; // point to the last dst pixel unsigned char* dstend = dst + stride * (h - 1) + w * 4; const unsigned char* src0 = src; int y = 0; #if __ARM_NEON for (; y + 7 < srch; y += 8) { const unsigned char* src1 = src0 + srcstride; unsigned char* dst6 = dstend - y * 4 - 8 * 4 - stride; unsigned char* dst7 = dstend - y * 4 - 8 * 4; int src_step = 2 * srcstride; int dst_step = -2 * stride; int nn = srcw >> 3; int remain = srcw - (nn << 3); #if __aarch64__ for (; nn > 0; nn--) { uint8x8x4_t _src0 = vld4_u8(src0); uint8x8x4_t _src1 = vld4_u8(src1); uint8x8x4_t _src2 = vld4_u8(src0 + src_step); uint8x8x4_t _src3 = vld4_u8(src1 + src_step); uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step); uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step); uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step); uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step); uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]); uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]); uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]); uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]); uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]); uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]); uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]); uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]); uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]); uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]); uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]); uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]); uint8x8x2_t _src01t_a = vtrn_u8(_src1.val[3], _src0.val[3]); uint8x8x2_t _src23t_a = vtrn_u8(_src3.val[3], _src2.val[3]); uint8x8x2_t _src45t_a = vtrn_u8(_src5.val[3], _src4.val[3]); uint8x8x2_t _src67t_a = vtrn_u8(_src7.val[3], _src6.val[3]); uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1])); uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0])); uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1])); uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0])); uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1])); uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0])); uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1])); uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0])); uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1])); uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0])); uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1])); uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0])); uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[1]), vreinterpret_u16_u8(_src01t_a.val[1])); uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[0]), vreinterpret_u16_u8(_src01t_a.val[0])); uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[1]), vreinterpret_u16_u8(_src45t_a.val[1])); uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[0]), vreinterpret_u16_u8(_src45t_a.val[0])); uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1])); uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1])); uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0])); uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0])); uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1])); uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1])); uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0])); uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0])); uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1])); uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1])); uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0])); uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0])); uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[1]), vreinterpret_u32_u16(_src02tt_a.val[1])); uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[1]), vreinterpret_u32_u16(_src13tt_a.val[1])); uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[0]), vreinterpret_u32_u16(_src02tt_a.val[0])); uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[0]), vreinterpret_u32_u16(_src13tt_a.val[0])); uint8x8x4_t _dst0; uint8x8x4_t _dst1; uint8x8x4_t _dst2; uint8x8x4_t _dst3; uint8x8x4_t _dst4; uint8x8x4_t _dst5; uint8x8x4_t _dst6; uint8x8x4_t _dst7; _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]); _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]); _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]); _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]); _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]); _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]); _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]); _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]); _dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]); _dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]); _dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]); _dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]); _dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]); _dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]); _dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]); _dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]); vst4_u8(dst7, _dst7); vst4_u8(dst6, _dst6); vst4_u8(dst7 + dst_step, _dst5); vst4_u8(dst6 + dst_step, _dst4); vst4_u8(dst7 + 2 * dst_step, _dst3); vst4_u8(dst6 + 2 * dst_step, _dst2); vst4_u8(dst7 + 3 * dst_step, _dst1); vst4_u8(dst6 + 3 * dst_step, _dst0); src0 += 4 * 8; src1 += 4 * 8; dst7 += 4 * dst_step; dst6 += 4 * dst_step; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #256] \n" "vld4.u8 {d0-d3}, [%1], %10 \n" "pld [%2, #256] \n" "vld4.u8 {d4-d7}, [%2], %10 \n" "pld [%1, #256] \n" "vld4.u8 {d8-d11}, [%1], %10 \n" "vtrn.u8 q2, q0 \n" // _src01t_r "vtrn.u8 q3, q1 \n" "pld [%2, #256] \n" "vld4.u8 {d12-d15}, [%2], %10\n" "pld [%1, #256] \n" "vld4.u8 {d16-d19}, [%1], %10\n" "vtrn.u8 q6, q4 \n" // _src23t_r "vtrn.u8 q7, q5 \n" "pld [%2, #256] \n" "vld4.u8 {d20-d23}, [%2], %10\n" "pld [%1, #256] \n" "vld4.u8 {d24-d27}, [%1], %10\n" "vtrn.u8 q10, q8 \n" // _src45t_r "vtrn.u8 q11, q9 \n" "pld [%2, #256] \n" "vld4.u8 {d28-d31}, [%2], %10\n" "vtrn.u8 q14, q12 \n" // _src67t_r "vtrn.u8 q15, q13 \n" "sub %1, %1, %10, lsl #2 \n" // restore src0 "vtrn.u16 q4, q0 \n" // _src02tt_r "vtrn.u16 q5, q1 \n" "sub %2, %2, %10, lsl #2 \n" // restore src1 "vtrn.u16 q6, q2 \n" // _src13tt_r "vtrn.u16 q7, q3 \n" "add %1, #32 \n" // src0 += 32 "vtrn.u16 q12, q8 \n" // _src46tt_r "vtrn.u16 q13, q9 \n" "add %2, #32 \n" // src1 += 32 "vtrn.u16 q14, q10 \n" // _src57tt_r "vtrn.u16 q15, q11 \n" "vtrn.u32 q12, q4 \n" // _src26ttt_r "vtrn.u32 q13, q5 \n" "vtrn.u32 q14, q6 \n" // _src37ttt_r "vst4.u8 {d24-d27}, [%4], %11\n" "vtrn.u32 q15, q7 \n" "vtrn.u32 q8, q0 \n" // _src04ttt_r "vst4.u8 {d28-d31}, [%3], %11\n" "vtrn.u32 q9, q1 \n" "vtrn.u32 q10, q2 \n" // _src15ttt_r "vst4.u8 {d16-d19}, [%4], %11\n" "vtrn.u32 q11, q3 \n" "subs %0, #1 \n" "vst4.u8 {d8-d11}, [%4], %11 \n" "vst4.u8 {d20-d23}, [%3], %11\n" "vst4.u8 {d12-d15}, [%3], %11\n" "vst4.u8 {d0-d3}, [%4], %11 \n" "vst4.u8 {d4-d7}, [%3], %11 \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(src1), // %2 "=r"(dst7), // %3 "=r"(dst6) // %4 : "0"(nn), "1"(src0), "2"(src1), "3"(dst7), "4"(dst6), "r"(src_step), // %10 "r"(dst_step) // %11 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); } #endif // __aarch64__ for (; remain > 0; remain--) { dst7[0] = src1[0 + 3 * src_step]; dst7[1] = src1[1 + 3 * src_step]; dst7[2] = src1[2 + 3 * src_step]; dst7[3] = src1[3 + 3 * src_step]; dst7[4] = src0[0 + 3 * src_step]; dst7[5] = src0[1 + 3 * src_step]; dst7[6] = src0[2 + 3 * src_step]; dst7[7] = src0[3 + 3 * src_step]; dst7[8] = src1[0 + 2 * src_step]; dst7[9] = src1[1 + 2 * src_step]; dst7[10] = src1[2 + 2 * src_step]; dst7[11] = src1[3 + 2 * src_step]; dst7[12] = src0[0 + 2 * src_step]; dst7[13] = src0[1 + 2 * src_step]; dst7[14] = src0[2 + 2 * src_step]; dst7[15] = src0[3 + 2 * src_step]; dst7[16] = src1[0 + src_step]; dst7[17] = src1[1 + src_step]; dst7[18] = src1[2 + src_step]; dst7[19] = src1[3 + src_step]; dst7[20] = src0[0 + src_step]; dst7[21] = src0[1 + src_step]; dst7[22] = src0[2 + src_step]; dst7[23] = src0[3 + src_step]; dst7[24] = src1[0]; dst7[25] = src1[1]; dst7[26] = src1[2]; dst7[27] = src1[3]; dst7[28] = src0[0]; dst7[29] = src0[1]; dst7[30] = src0[2]; dst7[31] = src0[3]; src0 += 4; src1 += 4; dst7 -= stride; } src0 += srcwgap + 7 * srcstride; } #endif // __ARM_NEON for (; y < srch; y++) { unsigned char* dst0 = dstend - y * 4 - 4; int x = 0; for (; x < srcw; x++) { dst0[0] = src0[0]; dst0[1] = src0[1]; dst0[2] = src0[2]; dst0[3] = src0[3]; src0 += 4; dst0 -= stride; } src0 += srcwgap; } } static void kanna_rotate_8_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride) { const int srcwgap = srcstride - srcw; // point to the last dst pixel row unsigned char* dstend = dst + stride * (h - 1); const unsigned char* src0 = src; int y = 0; #if __ARM_NEON for (; y + 7 < srch; y += 8) { const unsigned char* src1 = src0 + srcstride; unsigned char* dst7 = dstend + y; unsigned char* dst6 = dstend + y - stride; int src_step = 2 * srcstride; int dst_step = -2 * stride; int nn = srcw >> 3; int remain = srcw - (nn << 3); #if __aarch64__ for (; nn > 0; nn--) { uint8x8_t _src0 = vld1_u8(src0); uint8x8_t _src1 = vld1_u8(src1); uint8x8_t _src2 = vld1_u8(src0 + src_step); uint8x8_t _src3 = vld1_u8(src1 + src_step); uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step); uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step); uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step); uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step); uint8x8x2_t _src01t_r = vtrn_u8(_src0, _src1); uint8x8x2_t _src23t_r = vtrn_u8(_src2, _src3); uint8x8x2_t _src45t_r = vtrn_u8(_src4, _src5); uint8x8x2_t _src67t_r = vtrn_u8(_src6, _src7); uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0])); uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1])); uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0])); uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1])); uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0])); uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0])); uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1])); uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1])); uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[0]); uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[0]); uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[0]); uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[0]); uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[1]); uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[1]); uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[1]); uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[1]); vst1_u8(dst7, _dst0); vst1_u8(dst6, _dst1); vst1_u8(dst7 + dst_step, _dst2); vst1_u8(dst6 + dst_step, _dst3); vst1_u8(dst7 + 2 * dst_step, _dst4); vst1_u8(dst6 + 2 * dst_step, _dst5); vst1_u8(dst7 + 3 * dst_step, _dst6); vst1_u8(dst6 + 3 * dst_step, _dst7); src0 += 8; src1 += 8; dst7 += 4 * dst_step; dst6 += 4 * dst_step; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #64] \n" "vld1.u8 {d0}, [%1], %10 \n" "pld [%2, #64] \n" "vld1.u8 {d1}, [%2], %10 \n" "pld [%1, #64] \n" "vld1.u8 {d2}, [%1], %10 \n" "vtrn.u8 d0, d1 \n" // _src01t_r "pld [%2, #64] \n" "vld1.u8 {d3}, [%2], %10 \n" "pld [%1, #64] \n" "vld1.u8 {d4}, [%1], %10 \n" "vtrn.u8 d2, d3 \n" // _src23t_r "pld [%2, #64] \n" "vld1.u8 {d5}, [%2], %10 \n" "pld [%1, #64] \n" "vld1.u8 {d6}, [%1], %10 \n" "vtrn.u8 d4, d5 \n" // _src45t_r "pld [%2, #64] \n" "vld1.u8 {d7}, [%2], %10 \n" "vtrn.u8 d6, d7 \n" // _src67t_r "sub %1, %1, %10, lsl #2 \n" // restore src0 "vtrn.u16 q0, q1 \n" // _src02tt_r _src13tt_r "sub %2, %2, %10, lsl #2 \n" // restore src1 "vtrn.u16 q2, q3 \n" // _src46tt_r _src57tt_r "add %1, #8 \n" // src0 += 8 "vtrn.u32 q0, q2 \n" // _src04ttt_r _src15ttt_r "add %2, #8 \n" // src1 += 8 "vtrn.u32 q1, q3 \n" // _src26ttt_r _src37ttt_r "vst1.u8 {d0}, [%3], %11 \n" "vst1.u8 {d1}, [%4], %11 \n" "subs %0, #1 \n" "vst1.u8 {d2}, [%3], %11 \n" "vst1.u8 {d3}, [%4], %11 \n" "vst1.u8 {d4}, [%3], %11 \n" "vst1.u8 {d5}, [%4], %11 \n" "vst1.u8 {d6}, [%3], %11 \n" "vst1.u8 {d7}, [%4], %11 \n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(src1), // %2 "=r"(dst7), // %3 "=r"(dst6) // %4 : "0"(nn), "1"(src0), "2"(src1), "3"(dst7), "4"(dst6), "r"(src_step), // %10 "r"(dst_step) // %11 : "cc", "memory", "q0", "q1", "q2", "q3"); } #endif // __aarch64__ for (; remain > 0; remain--) { dst7[0] = src0[0]; dst7[1] = src1[0]; dst7[2] = src0[0 + src_step]; dst7[3] = src1[0 + src_step]; dst7[4] = src0[0 + 2 * src_step]; dst7[5] = src1[0 + 2 * src_step]; dst7[6] = src0[0 + 3 * src_step]; dst7[7] = src1[0 + 3 * src_step]; src0 += 1; src1 += 1; dst7 -= stride; } src0 += srcwgap + 7 * srcstride; } #endif // __ARM_NEON for (; y < srch; y++) { unsigned char* dst0 = dstend + y; int x = 0; for (; x < srcw; x++) { *dst0 = *src0; src0 += 1; dst0 -= stride; } src0 += srcwgap; } } static void kanna_rotate_8_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride) { const int srcwgap = srcstride - srcw * 2; // point to the last dst pixel row unsigned char* dstend = dst + stride * (h - 1); const unsigned char* src0 = src; int y = 0; #if __ARM_NEON for (; y + 7 < srch; y += 8) { const unsigned char* src1 = src0 + srcstride; unsigned char* dst7 = dstend + y * 2; unsigned char* dst6 = dstend + y * 2 - stride; int src_step = 2 * srcstride; int dst_step = -2 * stride; int nn = srcw >> 3; int remain = srcw - (nn << 3); #if __aarch64__ for (; nn > 0; nn--) { uint8x8x2_t _src0 = vld2_u8(src0); uint8x8x2_t _src1 = vld2_u8(src1); uint8x8x2_t _src2 = vld2_u8(src0 + src_step); uint8x8x2_t _src3 = vld2_u8(src1 + src_step); uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step); uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step); uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step); uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step); uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]); uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]); uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]); uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]); uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]); uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]); uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]); uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]); uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0])); uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1])); uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0])); uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1])); uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0])); uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1])); uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0])); uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1])); uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0])); uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0])); uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1])); uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1])); uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0])); uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0])); uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1])); uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1])); uint8x8x2_t _dst0; uint8x8x2_t _dst1; uint8x8x2_t _dst2; uint8x8x2_t _dst3; uint8x8x2_t _dst4; uint8x8x2_t _dst5; uint8x8x2_t _dst6; uint8x8x2_t _dst7; _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); vst2_u8(dst7, _dst0); vst2_u8(dst6, _dst1); vst2_u8(dst7 + dst_step, _dst2); vst2_u8(dst6 + dst_step, _dst3); vst2_u8(dst7 + 2 * dst_step, _dst4); vst2_u8(dst6 + 2 * dst_step, _dst5); vst2_u8(dst7 + 3 * dst_step, _dst6); vst2_u8(dst6 + 3 * dst_step, _dst7); src0 += 2 * 8; src1 += 2 * 8; dst7 += 4 * dst_step; dst6 += 4 * dst_step; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #128] \n" "vld2.u8 {d0-d1}, [%1], %10 \n" "pld [%2, #128] \n" "vld2.u8 {d2-d3}, [%2], %10 \n" "pld [%1, #128] \n" "vld2.u8 {d4-d5}, [%1], %10 \n" "vtrn.u8 q0, q1 \n" // _src01t_r "pld [%2, #128] \n" "vld2.u8 {d6-d7}, [%2], %10 \n" "pld [%1, #128] \n" "vld2.u8 {d16-d17}, [%1], %10\n" "vtrn.u8 q2, q3 \n" // _src23t_r "pld [%2, #128] \n" "vld2.u8 {d18-d19}, [%2], %10\n" "pld [%1, #128] \n" "vld2.u8 {d20-d21}, [%1], %10\n" "vtrn.u8 q8, q9 \n" // _src45t_r "pld [%2, #128] \n" "vld2.u8 {d22-d23}, [%2], %10\n" "vtrn.u8 q10, q11 \n" // _src67t_r "sub %1, %1, %10, lsl #2 \n" // restore src0 "vtrn.u16 q0, q2 \n" // _src02tt_r "sub %2, %2, %10, lsl #2 \n" // restore src1 "vtrn.u16 q1, q3 \n" // _src13tt_r "add %1, #16 \n" // src0 += 16 "vtrn.u16 q8, q10 \n" // _src46tt_r "add %2, #16 \n" // src1 += 16 "vtrn.u16 q9, q11 \n" // _src57tt_r "vtrn.u32 q0, q8 \n" // _src04ttt_r "vtrn.u32 q1, q9 \n" // _src15ttt_r "vst2.u8 {d0-d1}, [%3], %11 \n" "vtrn.u32 q2, q10 \n" // _src26ttt_r "vst2.u8 {d2-d3}, [%4], %11 \n" "vtrn.u32 q3, q11 \n" // _src37ttt_r "vst2.u8 {d4-d5}, [%3], %11 \n" "subs %0, #1 \n" "vst2.u8 {d16-d17}, [%3], %11\n" "vst2.u8 {d6-d7}, [%4], %11 \n" "vst2.u8 {d18-d19}, [%4], %11\n" "vst2.u8 {d20-d21}, [%3], %11\n" "vst2.u8 {d22-d23}, [%4], %11\n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(src1), // %2 "=r"(dst7), // %3 "=r"(dst6) // %4 : "0"(nn), "1"(src0), "2"(src1), "3"(dst7), "4"(dst6), "r"(src_step), // %10 "r"(dst_step) // %11 : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); } #endif // __aarch64__ for (; remain > 0; remain--) { dst7[0] = src0[0]; dst7[1] = src0[1]; dst7[2] = src1[0]; dst7[3] = src1[1]; dst7[4] = src0[0 + src_step]; dst7[5] = src0[1 + src_step]; dst7[6] = src1[0 + src_step]; dst7[7] = src1[1 + src_step]; dst7[8] = src0[0 + 2 * src_step]; dst7[9] = src0[1 + 2 * src_step]; dst7[10] = src1[0 + 2 * src_step]; dst7[11] = src1[1 + 2 * src_step]; dst7[12] = src0[0 + 3 * src_step]; dst7[13] = src0[1 + 3 * src_step]; dst7[14] = src1[0 + 3 * src_step]; dst7[15] = src1[1 + 3 * src_step]; src0 += 2; src1 += 2; dst7 -= stride; } src0 += srcwgap + 7 * srcstride; } #endif // __ARM_NEON for (; y < srch; y++) { unsigned char* dst0 = dstend + y * 2; int x = 0; for (; x < srcw; x++) { dst0[0] = src0[0]; dst0[1] = src0[1]; src0 += 2; dst0 -= stride; } src0 += srcwgap; } } static void kanna_rotate_8_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride) { const int srcwgap = srcstride - srcw * 3; // point to the last dst pixel row unsigned char* dstend = dst + stride * (h - 1); const unsigned char* src0 = src; int y = 0; #if __ARM_NEON for (; y + 7 < srch; y += 8) { const unsigned char* src1 = src0 + srcstride; unsigned char* dst7 = dstend + y * 3; unsigned char* dst6 = dstend + y * 3 - stride; int src_step = 2 * srcstride; int dst_step = -2 * stride; int nn = srcw >> 3; int remain = srcw - (nn << 3); #if __aarch64__ for (; nn > 0; nn--) { uint8x8x3_t _src0 = vld3_u8(src0); uint8x8x3_t _src1 = vld3_u8(src1); uint8x8x3_t _src2 = vld3_u8(src0 + src_step); uint8x8x3_t _src3 = vld3_u8(src1 + src_step); uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step); uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step); uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step); uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step); uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]); uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]); uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]); uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]); uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]); uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]); uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]); uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]); uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]); uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]); uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]); uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]); uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0])); uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1])); uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0])); uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1])); uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0])); uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1])); uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0])); uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1])); uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0])); uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1])); uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0])); uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1])); uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0])); uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0])); uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1])); uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1])); uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0])); uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0])); uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1])); uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1])); uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0])); uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0])); uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1])); uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1])); uint8x8x3_t _dst0; uint8x8x3_t _dst1; uint8x8x3_t _dst2; uint8x8x3_t _dst3; uint8x8x3_t _dst4; uint8x8x3_t _dst5; uint8x8x3_t _dst6; uint8x8x3_t _dst7; _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]); _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]); _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]); _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]); _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]); _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]); _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]); _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]); vst3_u8(dst7, _dst0); vst3_u8(dst6, _dst1); vst3_u8(dst7 + dst_step, _dst2); vst3_u8(dst6 + dst_step, _dst3); vst3_u8(dst7 + 2 * dst_step, _dst4); vst3_u8(dst6 + 2 * dst_step, _dst5); vst3_u8(dst7 + 3 * dst_step, _dst6); vst3_u8(dst6 + 3 * dst_step, _dst7); src0 += 3 * 8; src1 += 3 * 8; dst7 += 4 * dst_step; dst6 += 4 * dst_step; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #192] \n" "vld3.u8 {d0-d2}, [%1], %10 \n" "pld [%2, #192] \n" "vld3.u8 {d4-d6}, [%2], %10 \n" "pld [%1, #192] \n" "vld3.u8 {d8-d10}, [%1], %10 \n" "vtrn.u8 q0, q2 \n" // _src01t_r "vtrn.u8 d2, d6 \n" "pld [%2, #192] \n" "vld3.u8 {d12-d14}, [%2], %10\n" "pld [%1, #192] \n" "vld3.u8 {d16-d18}, [%1], %10\n" "vtrn.u8 q4, q6 \n" // _src23t_r "vtrn.u8 d10, d14 \n" "pld [%2, #192] \n" "vld3.u8 {d20-d22}, [%2], %10\n" "pld [%1, #192] \n" "vld3.u8 {d24-d26}, [%1], %10\n" "vtrn.u8 q8, q10 \n" // _src45t_r "vtrn.u8 d18, d22 \n" "pld [%2, #192] \n" "vld3.u8 {d28-d30}, [%2], %10\n" "vtrn.u8 q12, q14 \n" // _src67t_r "vtrn.u8 d26, d30 \n" "sub %1, %1, %10, lsl #2 \n" // restore src0 "vtrn.u16 q0, q4 \n" // _src02tt_r "vtrn.u16 d2, d10 \n" "sub %2, %2, %10, lsl #2 \n" // restore src1 "vtrn.u16 q2, q6 \n" // _src13tt_r "vtrn.u16 d6, d14 \n" "add %1, #24 \n" // src0 += 24 "vtrn.u16 q8, q12 \n" // _src46tt_r "vtrn.u16 d18, d26 \n" "add %2, #24 \n" // src1 += 24 "vtrn.u16 q10, q14 \n" // _src57tt_r "vtrn.u16 d22, d30 \n" "vtrn.u32 q0, q8 \n" // _src04ttt_r "vtrn.u32 d2, d18 \n" "vtrn.u32 q2, q10 \n" // _src15ttt_r "vst3.u8 {d0-d2}, [%3], %11 \n" "vtrn.u32 d6, d22 \n" "vtrn.u32 q4, q12 \n" // _src26ttt_r "vst3.u8 {d4-d6}, [%4], %11 \n" "vtrn.u32 d10, d26 \n" "vtrn.u32 q6, q14 \n" // _src37ttt_r "vst3.u8 {d8-d10}, [%3], %11 \n" "vtrn.u32 d14, d30 \n" "subs %0, #1 \n" "vst3.u8 {d16-d18}, [%3], %11\n" "vst3.u8 {d12-d14}, [%4], %11\n" "vst3.u8 {d20-d22}, [%4], %11\n" "vst3.u8 {d24-d26}, [%3], %11\n" "vst3.u8 {d28-d30}, [%4], %11\n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(src1), // %2 "=r"(dst7), // %3 "=r"(dst6) // %4 : "0"(nn), "1"(src0), "2"(src1), "3"(dst7), "4"(dst6), "r"(src_step), // %10 "r"(dst_step) // %11 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); } #endif // __aarch64__ for (; remain > 0; remain--) { dst7[0] = src0[0]; dst7[1] = src0[1]; dst7[2] = src0[2]; dst7[3] = src1[0]; dst7[4] = src1[1]; dst7[5] = src1[2]; dst7[6] = src0[0 + src_step]; dst7[7] = src0[1 + src_step]; dst7[8] = src0[2 + src_step]; dst7[9] = src1[0 + src_step]; dst7[10] = src1[1 + src_step]; dst7[11] = src1[2 + src_step]; dst7[12] = src0[0 + 2 * src_step]; dst7[13] = src0[1 + 2 * src_step]; dst7[14] = src0[2 + 2 * src_step]; dst7[15] = src1[0 + 2 * src_step]; dst7[16] = src1[1 + 2 * src_step]; dst7[17] = src1[2 + 2 * src_step]; dst7[18] = src0[0 + 3 * src_step]; dst7[19] = src0[1 + 3 * src_step]; dst7[20] = src0[2 + 3 * src_step]; dst7[21] = src1[0 + 3 * src_step]; dst7[22] = src1[1 + 3 * src_step]; dst7[23] = src1[2 + 3 * src_step]; src0 += 3; src1 += 3; dst7 -= stride; } src0 += srcwgap + 7 * srcstride; } #endif // __ARM_NEON for (; y < srch; y++) { unsigned char* dst0 = dstend + y * 3; int x = 0; for (; x < srcw; x++) { dst0[0] = src0[0]; dst0[1] = src0[1]; dst0[2] = src0[2]; src0 += 3; dst0 -= stride; } src0 += srcwgap; } } static void kanna_rotate_8_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride) { const int srcwgap = srcstride - srcw * 4; // point to the last dst pixel row unsigned char* dstend = dst + stride * (h - 1); const unsigned char* src0 = src; int y = 0; #if __ARM_NEON for (; y + 7 < srch; y += 8) { const unsigned char* src1 = src0 + srcstride; unsigned char* dst7 = dstend + y * 4; unsigned char* dst6 = dstend + y * 4 - stride; int src_step = 2 * srcstride; int dst_step = -2 * stride; int nn = srcw >> 3; int remain = srcw - (nn << 3); #if __aarch64__ for (; nn > 0; nn--) { uint8x8x4_t _src0 = vld4_u8(src0); uint8x8x4_t _src1 = vld4_u8(src1); uint8x8x4_t _src2 = vld4_u8(src0 + src_step); uint8x8x4_t _src3 = vld4_u8(src1 + src_step); uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step); uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step); uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step); uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step); uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]); uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]); uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]); uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]); uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]); uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]); uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]); uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]); uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]); uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]); uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]); uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]); uint8x8x2_t _src01t_a = vtrn_u8(_src0.val[3], _src1.val[3]); uint8x8x2_t _src23t_a = vtrn_u8(_src2.val[3], _src3.val[3]); uint8x8x2_t _src45t_a = vtrn_u8(_src4.val[3], _src5.val[3]); uint8x8x2_t _src67t_a = vtrn_u8(_src6.val[3], _src7.val[3]); uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0])); uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1])); uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0])); uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1])); uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0])); uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1])); uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0])); uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1])); uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0])); uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1])); uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0])); uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1])); uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[0]), vreinterpret_u16_u8(_src23t_a.val[0])); uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[1]), vreinterpret_u16_u8(_src23t_a.val[1])); uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[0]), vreinterpret_u16_u8(_src67t_a.val[0])); uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[1]), vreinterpret_u16_u8(_src67t_a.val[1])); uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0])); uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0])); uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1])); uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1])); uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0])); uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0])); uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1])); uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1])); uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0])); uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0])); uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1])); uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1])); uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[0]), vreinterpret_u32_u16(_src46tt_a.val[0])); uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[0]), vreinterpret_u32_u16(_src57tt_a.val[0])); uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[1]), vreinterpret_u32_u16(_src46tt_a.val[1])); uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[1]), vreinterpret_u32_u16(_src57tt_a.val[1])); uint8x8x4_t _dst0; uint8x8x4_t _dst1; uint8x8x4_t _dst2; uint8x8x4_t _dst3; uint8x8x4_t _dst4; uint8x8x4_t _dst5; uint8x8x4_t _dst6; uint8x8x4_t _dst7; _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]); _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]); _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]); _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]); _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]); _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]); _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]); _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]); _dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]); _dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]); _dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]); _dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]); _dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]); _dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]); _dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]); _dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]); vst4_u8(dst7, _dst0); vst4_u8(dst6, _dst1); vst4_u8(dst7 + dst_step, _dst2); vst4_u8(dst6 + dst_step, _dst3); vst4_u8(dst7 + 2 * dst_step, _dst4); vst4_u8(dst6 + 2 * dst_step, _dst5); vst4_u8(dst7 + 3 * dst_step, _dst6); vst4_u8(dst6 + 3 * dst_step, _dst7); src0 += 4 * 8; src1 += 4 * 8; dst7 += 4 * dst_step; dst6 += 4 * dst_step; } #else if (nn > 0) { asm volatile( "0: \n" "pld [%1, #256] \n" "vld4.u8 {d0-d3}, [%1], %10 \n" "pld [%2, #256] \n" "vld4.u8 {d4-d7}, [%2], %10 \n" "pld [%1, #256] \n" "vld4.u8 {d8-d11}, [%1], %10 \n" "vtrn.u8 q0, q2 \n" // _src01t_r "vtrn.u8 q1, q3 \n" "pld [%2, #256] \n" "vld4.u8 {d12-d15}, [%2], %10\n" "pld [%1, #256] \n" "vld4.u8 {d16-d19}, [%1], %10\n" "vtrn.u8 q4, q6 \n" // _src23t_r "vtrn.u8 q5, q7 \n" "pld [%2, #256] \n" "vld4.u8 {d20-d23}, [%2], %10\n" "pld [%1, #256] \n" "vld4.u8 {d24-d27}, [%1], %10\n" "vtrn.u8 q8, q10 \n" // _src45t_r "vtrn.u8 q9, q11 \n" "pld [%2, #256] \n" "vld4.u8 {d28-d31}, [%2], %10\n" "vtrn.u8 q12, q14 \n" // _src67t_r "vtrn.u8 q13, q15 \n" "sub %1, %1, %10, lsl #2 \n" // restore src0 "vtrn.u16 q0, q4 \n" // _src02tt_r "vtrn.u16 q1, q5 \n" "sub %2, %2, %10, lsl #2 \n" // restore src1 "vtrn.u16 q2, q6 \n" // _src13tt_r "vtrn.u16 q3, q7 \n" "add %1, #32 \n" // src0 += 32 "vtrn.u16 q8, q12 \n" // _src46tt_r "vtrn.u16 q9, q13 \n" "add %2, #32 \n" // src1 += 32 "vtrn.u16 q10, q14 \n" // _src57tt_r "vtrn.u16 q11, q15 \n" "vtrn.u32 q0, q8 \n" // _src04ttt_r "vtrn.u32 q1, q9 \n" "vtrn.u32 q2, q10 \n" // _src15ttt_r "vst4.u8 {d0-d3}, [%3], %11 \n" "vtrn.u32 q3, q11 \n" "vtrn.u32 q4, q12 \n" // _src26ttt_r "vst4.u8 {d4-d7}, [%4], %11 \n" "vtrn.u32 q5, q13 \n" "vtrn.u32 q6, q14 \n" // _src37ttt_r "vst4.u8 {d8-d11}, [%3], %11 \n" "vtrn.u32 q7, q15 \n" "subs %0, #1 \n" "vst4.u8 {d16-d19}, [%3], %11\n" "vst4.u8 {d12-d15}, [%4], %11\n" "vst4.u8 {d20-d23}, [%4], %11\n" "vst4.u8 {d24-d27}, [%3], %11\n" "vst4.u8 {d28-d31}, [%4], %11\n" "bne 0b \n" : "=r"(nn), // %0 "=r"(src0), // %1 "=r"(src1), // %2 "=r"(dst7), // %3 "=r"(dst6) // %4 : "0"(nn), "1"(src0), "2"(src1), "3"(dst7), "4"(dst6), "r"(src_step), // %10 "r"(dst_step) // %11 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); } #endif // __aarch64__ for (; remain > 0; remain--) { dst7[0] = src0[0]; dst7[1] = src0[1]; dst7[2] = src0[2]; dst7[3] = src0[3]; dst7[4] = src1[0]; dst7[5] = src1[1]; dst7[6] = src1[2]; dst7[7] = src1[3]; dst7[8] = src0[0 + src_step]; dst7[9] = src0[1 + src_step]; dst7[10] = src0[2 + src_step]; dst7[11] = src0[3 + src_step]; dst7[12] = src1[0 + src_step]; dst7[13] = src1[1 + src_step]; dst7[14] = src1[2 + src_step]; dst7[15] = src1[3 + src_step]; dst7[16] = src0[0 + 2 * src_step]; dst7[17] = src0[1 + 2 * src_step]; dst7[18] = src0[2 + 2 * src_step]; dst7[19] = src0[3 + 2 * src_step]; dst7[20] = src1[0 + 2 * src_step]; dst7[21] = src1[1 + 2 * src_step]; dst7[22] = src1[2 + 2 * src_step]; dst7[23] = src1[3 + 2 * src_step]; dst7[24] = src0[0 + 3 * src_step]; dst7[25] = src0[1 + 3 * src_step]; dst7[26] = src0[2 + 3 * src_step]; dst7[27] = src0[3 + 3 * src_step]; dst7[28] = src1[0 + 3 * src_step]; dst7[29] = src1[1 + 3 * src_step]; dst7[30] = src1[2 + 3 * src_step]; dst7[31] = src1[3 + 3 * src_step]; src0 += 4; src1 += 4; dst7 -= stride; } src0 += srcwgap + 7 * srcstride; } #endif // __ARM_NEON for (; y < srch; y++) { unsigned char* dst0 = dstend + y * 4; int x = 0; for (; x < srcw; x++) { dst0[0] = src0[0]; dst0[1] = src0[1]; dst0[2] = src0[2]; dst0[3] = src0[3]; src0 += 4; dst0 -= stride; } src0 += srcwgap; } } void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type) { return kanna_rotate_c1(src, srcw, srch, srcw, dst, w, h, w, type); } void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type) { return kanna_rotate_c2(src, srcw, srch, srcw * 2, dst, w, h, w * 2, type); } void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type) { return kanna_rotate_c3(src, srcw, srch, srcw * 3, dst, w, h, w * 3, type); } void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type) { return kanna_rotate_c4(src, srcw, srch, srcw * 4, dst, w, h, w * 4, type); } void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type) { // assert srcw == w && srch == h for type 1234 // assert srcw == h && srch == w for type 5678 switch (type) { case 1: kanna_rotate_1_c1(src, srcw, srch, srcstride, dst, w, h, stride); break; case 2: kanna_rotate_2_c1(src, srcw, srch, srcstride, dst, w, h, stride); break; case 3: kanna_rotate_3_c1(src, srcw, srch, srcstride, dst, w, h, stride); break; case 4: kanna_rotate_4_c1(src, srcw, srch, srcstride, dst, w, h, stride); break; case 5: kanna_rotate_5_c1(src, srcw, srch, srcstride, dst, w, h, stride); break; case 6: kanna_rotate_6_c1(src, srcw, srch, srcstride, dst, w, h, stride); break; case 7: kanna_rotate_7_c1(src, srcw, srch, srcstride, dst, w, h, stride); break; case 8: kanna_rotate_8_c1(src, srcw, srch, srcstride, dst, w, h, stride); break; default: // unsupported rotate type break; } } void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type) { // assert srcw == w && srch == h for type 1234 // assert srcw == h && srch == w for type 5678 switch (type) { case 1: kanna_rotate_1_c2(src, srcw, srch, srcstride, dst, w, h, stride); break; case 2: kanna_rotate_2_c2(src, srcw, srch, srcstride, dst, w, h, stride); break; case 3: kanna_rotate_3_c2(src, srcw, srch, srcstride, dst, w, h, stride); break; case 4: kanna_rotate_4_c2(src, srcw, srch, srcstride, dst, w, h, stride); break; case 5: kanna_rotate_5_c2(src, srcw, srch, srcstride, dst, w, h, stride); break; case 6: kanna_rotate_6_c2(src, srcw, srch, srcstride, dst, w, h, stride); break; case 7: kanna_rotate_7_c2(src, srcw, srch, srcstride, dst, w, h, stride); break; case 8: kanna_rotate_8_c2(src, srcw, srch, srcstride, dst, w, h, stride); break; default: // unsupported rotate type break; } } void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type) { // assert srcw == w && srch == h for type 1234 // assert srcw == h && srch == w for type 5678 switch (type) { case 1: kanna_rotate_1_c3(src, srcw, srch, srcstride, dst, w, h, stride); break; case 2: kanna_rotate_2_c3(src, srcw, srch, srcstride, dst, w, h, stride); break; case 3: kanna_rotate_3_c3(src, srcw, srch, srcstride, dst, w, h, stride); break; case 4: kanna_rotate_4_c3(src, srcw, srch, srcstride, dst, w, h, stride); break; case 5: kanna_rotate_5_c3(src, srcw, srch, srcstride, dst, w, h, stride); break; case 6: kanna_rotate_6_c3(src, srcw, srch, srcstride, dst, w, h, stride); break; case 7: kanna_rotate_7_c3(src, srcw, srch, srcstride, dst, w, h, stride); break; case 8: kanna_rotate_8_c3(src, srcw, srch, srcstride, dst, w, h, stride); break; default: // unsupported rotate type break; } } void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type) { // assert srcw == w && srch == h for type 1234 // assert srcw == h && srch == w for type 5678 switch (type) { case 1: kanna_rotate_1_c4(src, srcw, srch, srcstride, dst, w, h, stride); break; case 2: kanna_rotate_2_c4(src, srcw, srch, srcstride, dst, w, h, stride); break; case 3: kanna_rotate_3_c4(src, srcw, srch, srcstride, dst, w, h, stride); break; case 4: kanna_rotate_4_c4(src, srcw, srch, srcstride, dst, w, h, stride); break; case 5: kanna_rotate_5_c4(src, srcw, srch, srcstride, dst, w, h, stride); break; case 6: kanna_rotate_6_c4(src, srcw, srch, srcstride, dst, w, h, stride); break; case 7: kanna_rotate_7_c4(src, srcw, srch, srcstride, dst, w, h, stride); break; case 8: kanna_rotate_8_c4(src, srcw, srch, srcstride, dst, w, h, stride); break; default: // unsupported rotate type break; } } void kanna_rotate_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type) { // assert srcw % 2 == 0 // assert srch % 2 == 0 // assert w % 2 == 0 // assert h % 2 == 0 const unsigned char* srcY = src; unsigned char* dstY = dst; kanna_rotate_c1(srcY, srcw, srch, dstY, w, h, type); const unsigned char* srcUV = src + srcw * srch; unsigned char* dstUV = dst + w * h; kanna_rotate_c2(srcUV, srcw / 2, srch / 2, dstUV, w / 2, h / 2, type); } #endif // NCNN_PIXEL_ROTATE } // namespace ncnn