// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "mat.h"
#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
#include "platform.h"

namespace ncnn {

#if NCNN_PIXEL_ROTATE
// should be a kanna ascii art here in my local branch
// but we shall ask the original art author for permission first ...
// https://www.reddit.com/r/anime/comments/5uxjn4/i_recreated_the_kanna_ascii_art_from_kobayashisan/

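// type 1: no rotation, straight row-by-row copy (two rows per pass on the NEON path), 1 byte per pixel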
static void kanna_rotate_1_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
{
    const int srcwgap = srcstride - srcw;
    const int wgap = stride - w;

    const unsigned char* src0 = src;
    const unsigned char* src1 = src + srcstride;
    unsigned char* dst0 = dst;
    unsigned char* dst1 = dst + stride;

    int y = 0;
    for (; y + 1 < srch; y += 2)
    {
#if __ARM_NEON
        int nn = srcw >> 5;
        int remain = srcw - (nn << 5);
#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x16_t _src0 = vld1q_u8(src0);
            uint8x16_t _src0n = vld1q_u8(src0 + 16);
            vst1q_u8(dst0, _src0);
            vst1q_u8(dst0 + 16, _src0n);

            uint8x16_t _src1 = vld1q_u8(src1);
            uint8x16_t _src1n = vld1q_u8(src1 + 16);
            vst1q_u8(dst1, _src1);
            vst1q_u8(dst1 + 16, _src1n);

            src0 += 32;
            src1 += 32;
            dst0 += 32;
            dst1 += 32;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "0: \n"
                "pld [%1, #256] \n"
                "vld1.u8 {d0-d3}, [%1]! \n"
                "pld [%2, #256] \n"
                "vld1.u8 {d4-d7}, [%2]! \n"
                "subs %0, #1 \n"
                "vst1.u8 {d0-d3}, [%3]! \n"
                "vst1.u8 {d4-d7}, [%4]! \n"
                "bne 0b \n"
                : "=r"(nn),   // %0
                "=r"(src0), // %1
                "=r"(src1), // %2
                "=r"(dst0), // %3
                "=r"(dst1)  // %4
                : "0"(nn),
                "1"(src0),
                "2"(src1),
                "3"(dst0),
                "4"(dst1)
                : "cc", "memory", "q0", "q1", "q2", "q3");
        }
#endif // __aarch64__
#else
        int remain = srcw;
#endif // __ARM_NEON

        for (; remain > 0; remain--)
        {
            *dst0++ = *src0++;
            *dst1++ = *src1++;
        }

        src0 += srcwgap + srcstride;
        src1 += srcwgap + srcstride;
        dst0 += wgap + stride;
        dst1 += wgap + stride;
    }

    for (; y < srch; y++)
    {
#if __ARM_NEON
        int nn = srcw >> 5;
        int remain = srcw - (nn << 5);
#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x16_t _src = vld1q_u8(src0);
            uint8x16_t _src2 = vld1q_u8(src0 + 16);
            vst1q_u8(dst0, _src);
            vst1q_u8(dst0 + 16, _src2);

            src0 += 32;
            dst0 += 32;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "0: \n"
                "pld [%1, #256] \n"
                "vld1.u8 {d0-d3}, [%1]! \n"
                "subs %0, #1 \n"
                "vst1.u8 {d0-d3}, [%2]! \n"
                "bne 0b \n"
                : "=r"(nn),   // %0
                "=r"(src0), // %1
                "=r"(dst0)  // %2
                : "0"(nn),
                "1"(src0),
                "2"(dst0)
                : "cc", "memory", "q0", "q1");
        }
#endif // __aarch64__
#else
        int remain = srcw;
#endif // __ARM_NEON

        for (; remain > 0; remain--)
        {
            *dst0++ = *src0++;
        }

        src0 += srcwgap;
        dst0 += wgap;
    }
}
|
|
|
|
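// type 1: no rotation, row-by-row copy, 2 bytes per pixel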
static void kanna_rotate_1_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw * 2;
|
|
const int wgap = stride - w * 2;
|
|
|
|
int size = srcw * 2;
|
|
|
|
const unsigned char* src0 = src;
|
|
const unsigned char* src1 = src + srcstride;
|
|
unsigned char* dst0 = dst;
|
|
unsigned char* dst1 = dst + stride;
|
|
|
|
int y = 0;
|
|
for (; y + 1 < srch; y += 2)
|
|
{
|
|
#if __ARM_NEON
|
|
int nn = size >> 5;
|
|
int remain = size - (nn << 5);
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x16_t _src0 = vld1q_u8(src0);
|
|
uint8x16_t _src0n = vld1q_u8(src0 + 16);
|
|
vst1q_u8(dst0, _src0);
|
|
vst1q_u8(dst0 + 16, _src0n);
|
|
|
|
uint8x16_t _src1 = vld1q_u8(src1);
|
|
uint8x16_t _src1n = vld1q_u8(src1 + 16);
|
|
vst1q_u8(dst1, _src1);
|
|
vst1q_u8(dst1 + 16, _src1n);
|
|
|
|
src0 += 32;
|
|
src1 += 32;
|
|
dst0 += 32;
|
|
dst1 += 32;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"0: \n"
|
|
"pld [%1, #256] \n"
|
|
"vld1.u8 {d0-d3}, [%1]! \n"
|
|
"pld [%2, #256] \n"
|
|
"vld1.u8 {d4-d7}, [%2]! \n"
|
|
"subs %0, #1 \n"
|
|
"vst1.u8 {d0-d3}, [%3]! \n"
|
|
"vst1.u8 {d4-d7}, [%4]! \n"
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(src1), // %2
|
|
"=r"(dst0), // %3
|
|
"=r"(dst1) // %4
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(src1),
|
|
"3"(dst0),
|
|
"4"(dst1)
|
|
: "cc", "memory", "q0", "q1", "q2", "q3");
|
|
}
|
|
#endif // __aarch64__
|
|
#else
|
|
int remain = size;
|
|
#endif // __ARM_NEON
|
|
|
|
for (; remain > 0; remain--)
|
|
{
|
|
*dst0++ = *src0++;
|
|
*dst1++ = *src1++;
|
|
}
|
|
|
|
src0 += srcwgap + srcstride;
|
|
src1 += srcwgap + srcstride;
|
|
dst0 += wgap + stride;
|
|
dst1 += wgap + stride;
|
|
}
|
|
|
|
for (; y < srch; y++)
|
|
{
|
|
#if __ARM_NEON
|
|
int nn = size >> 5;
|
|
int remain = size - (nn << 5);
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x16_t _src = vld1q_u8(src0);
|
|
uint8x16_t _src2 = vld1q_u8(src0 + 16);
|
|
vst1q_u8(dst0, _src);
|
|
vst1q_u8(dst0 + 16, _src2);
|
|
|
|
src0 += 32;
|
|
dst0 += 32;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"0: \n"
|
|
"pld [%1, #256] \n"
|
|
"vld1.u8 {d0-d3}, [%1]! \n"
|
|
"subs %0, #1 \n"
|
|
"vst1.u8 {d0-d3}, [%2]! \n"
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(dst0) // %2
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(dst0)
|
|
: "cc", "memory", "q0", "q1");
|
|
}
|
|
#endif // __aarch64__
|
|
#else
|
|
int remain = size;
|
|
#endif // __ARM_NEON
|
|
|
|
for (; remain > 0; remain--)
|
|
{
|
|
*dst0++ = *src0++;
|
|
}
|
|
|
|
src0 += srcwgap;
|
|
dst0 += wgap;
|
|
}
|
|
}
|
|
|
|
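// type 1: no rotation, row-by-row copy, 3 bytes per pixel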
static void kanna_rotate_1_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw * 3;
|
|
const int wgap = stride - w * 3;
|
|
|
|
int size = srcw * 3;
|
|
|
|
const unsigned char* src0 = src;
|
|
const unsigned char* src1 = src + srcstride;
|
|
unsigned char* dst0 = dst;
|
|
unsigned char* dst1 = dst + stride;
|
|
|
|
int y = 0;
|
|
for (; y + 1 < srch; y += 2)
|
|
{
|
|
#if __ARM_NEON
|
|
int nn = size >> 5;
|
|
int remain = size - (nn << 5);
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x16_t _src0 = vld1q_u8(src0);
|
|
uint8x16_t _src0n = vld1q_u8(src0 + 16);
|
|
vst1q_u8(dst0, _src0);
|
|
vst1q_u8(dst0 + 16, _src0n);
|
|
|
|
uint8x16_t _src1 = vld1q_u8(src1);
|
|
uint8x16_t _src1n = vld1q_u8(src1 + 16);
|
|
vst1q_u8(dst1, _src1);
|
|
vst1q_u8(dst1 + 16, _src1n);
|
|
|
|
src0 += 32;
|
|
src1 += 32;
|
|
dst0 += 32;
|
|
dst1 += 32;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"0: \n"
|
|
"pld [%1, #256] \n"
|
|
"vld1.u8 {d0-d3}, [%1]! \n"
|
|
"pld [%2, #256] \n"
|
|
"vld1.u8 {d4-d7}, [%2]! \n"
|
|
"subs %0, #1 \n"
|
|
"vst1.u8 {d0-d3}, [%3]! \n"
|
|
"vst1.u8 {d4-d7}, [%4]! \n"
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(src1), // %2
|
|
"=r"(dst0), // %3
|
|
"=r"(dst1) // %4
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(src1),
|
|
"3"(dst0),
|
|
"4"(dst1)
|
|
: "cc", "memory", "q0", "q1", "q2", "q3");
|
|
}
|
|
#endif // __aarch64__
|
|
#else
|
|
int remain = size;
|
|
#endif // __ARM_NEON
|
|
|
|
for (; remain > 0; remain--)
|
|
{
|
|
*dst0++ = *src0++;
|
|
*dst1++ = *src1++;
|
|
}
|
|
|
|
src0 += srcwgap + srcstride;
|
|
src1 += srcwgap + srcstride;
|
|
dst0 += wgap + stride;
|
|
dst1 += wgap + stride;
|
|
}
|
|
|
|
for (; y < srch; y++)
|
|
{
|
|
#if __ARM_NEON
|
|
int nn = size >> 5;
|
|
int remain = size - (nn << 5);
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x16_t _src = vld1q_u8(src0);
|
|
uint8x16_t _src2 = vld1q_u8(src0 + 16);
|
|
vst1q_u8(dst0, _src);
|
|
vst1q_u8(dst0 + 16, _src2);
|
|
|
|
src0 += 32;
|
|
dst0 += 32;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"0: \n"
|
|
"pld [%1, #256] \n"
|
|
"vld1.u8 {d0-d3}, [%1]! \n"
|
|
"subs %0, #1 \n"
|
|
"vst1.u8 {d0-d3}, [%2]! \n"
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(dst0) // %2
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(dst0)
|
|
: "cc", "memory", "q0", "q1");
|
|
}
|
|
#endif // __aarch64__
|
|
#else
|
|
int remain = size;
|
|
#endif // __ARM_NEON
|
|
|
|
for (; remain > 0; remain--)
|
|
{
|
|
*dst0++ = *src0++;
|
|
}
|
|
|
|
src0 += srcwgap;
|
|
dst0 += wgap;
|
|
}
|
|
}
|
|
|
|
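// type 1: no rotation, row-by-row copy, 4 bytes per pixel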
static void kanna_rotate_1_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw * 4;
|
|
const int wgap = stride - w * 4;
|
|
|
|
int size = srcw * 4;
|
|
|
|
const unsigned char* src0 = src;
|
|
const unsigned char* src1 = src + srcstride;
|
|
unsigned char* dst0 = dst;
|
|
unsigned char* dst1 = dst + stride;
|
|
|
|
int y = 0;
|
|
for (; y + 1 < srch; y += 2)
|
|
{
|
|
#if __ARM_NEON
|
|
int nn = size >> 5;
|
|
int remain = size - (nn << 5);
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x16_t _src0 = vld1q_u8(src0);
|
|
uint8x16_t _src0n = vld1q_u8(src0 + 16);
|
|
vst1q_u8(dst0, _src0);
|
|
vst1q_u8(dst0 + 16, _src0n);
|
|
|
|
uint8x16_t _src1 = vld1q_u8(src1);
|
|
uint8x16_t _src1n = vld1q_u8(src1 + 16);
|
|
vst1q_u8(dst1, _src1);
|
|
vst1q_u8(dst1 + 16, _src1n);
|
|
|
|
src0 += 32;
|
|
src1 += 32;
|
|
dst0 += 32;
|
|
dst1 += 32;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"0: \n"
|
|
"pld [%1, #256] \n"
|
|
"vld1.u8 {d0-d3}, [%1]! \n"
|
|
"pld [%2, #256] \n"
|
|
"vld1.u8 {d4-d7}, [%2]! \n"
|
|
"subs %0, #1 \n"
|
|
"vst1.u8 {d0-d3}, [%3]! \n"
|
|
"vst1.u8 {d4-d7}, [%4]! \n"
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(src1), // %2
|
|
"=r"(dst0), // %3
|
|
"=r"(dst1) // %4
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(src1),
|
|
"3"(dst0),
|
|
"4"(dst1)
|
|
: "cc", "memory", "q0", "q1", "q2", "q3");
|
|
}
|
|
#endif // __aarch64__
|
|
#else
|
|
int remain = size;
|
|
#endif // __ARM_NEON
|
|
|
|
for (; remain > 0; remain--)
|
|
{
|
|
*dst0++ = *src0++;
|
|
*dst1++ = *src1++;
|
|
}
|
|
|
|
src0 += srcwgap + srcstride;
|
|
src1 += srcwgap + srcstride;
|
|
dst0 += wgap + stride;
|
|
dst1 += wgap + stride;
|
|
}
|
|
|
|
for (; y < srch; y++)
|
|
{
|
|
#if __ARM_NEON
|
|
int nn = size >> 5;
|
|
int remain = size - (nn << 5);
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x16_t _src = vld1q_u8(src0);
|
|
uint8x16_t _src2 = vld1q_u8(src0 + 16);
|
|
vst1q_u8(dst0, _src);
|
|
vst1q_u8(dst0 + 16, _src2);
|
|
|
|
src0 += 32;
|
|
dst0 += 32;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"0: \n"
|
|
"pld [%1, #256] \n"
|
|
"vld1.u8 {d0-d3}, [%1]! \n"
|
|
"subs %0, #1 \n"
|
|
"vst1.u8 {d0-d3}, [%2]! \n"
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(dst0) // %2
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(dst0)
|
|
: "cc", "memory", "q0", "q1");
|
|
}
|
|
#endif // __aarch64__
|
|
#else
|
|
int remain = size;
|
|
#endif // __ARM_NEON
|
|
|
|
for (; remain > 0; remain--)
|
|
{
|
|
*dst0++ = *src0++;
|
|
}
|
|
|
|
src0 += srcwgap;
|
|
dst0 += wgap;
|
|
}
|
|
}
|
|
|
|
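// type 2: mirror horizontally (reverse pixel order within each row), 1 byte per pixel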
static void kanna_rotate_2_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw;
|
|
const int wgap = stride + w;
|
|
|
|
const unsigned char* src0 = src;
|
|
unsigned char* dst0 = dst + w - 1;
|
|
|
|
int y = 0;
|
|
for (; y < srch; y++)
|
|
{
|
|
#if __ARM_NEON
|
|
dst0 -= 15;
|
|
|
|
int nn = srcw >> 4;
|
|
int remain = srcw - (nn << 4);
|
|
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x8_t _src = vld1_u8(src0);
|
|
uint8x8_t _src2 = vld1_u8(src0 + 8);
|
|
|
|
_src = vrev64_u8(_src);
|
|
_src2 = vrev64_u8(_src2);
|
|
|
|
vst1_u8(dst0, _src2);
|
|
vst1_u8(dst0 + 8, _src);
|
|
|
|
src0 += 16;
|
|
dst0 -= 16;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"mov r4, #-16 \n"
|
|
"0: \n"
|
|
"pld [%1, #128] \n"
|
|
"vld1.u8 {d0-d1}, [%1]! \n"
|
|
"vrev64.u8 d3, d0 \n"
|
|
"vrev64.u8 d2, d1 \n"
|
|
"subs %0, #1 \n"
|
|
"vst1.u8 {d2-d3}, [%2], r4 \n"
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(dst0) // %2
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(dst0)
|
|
: "cc", "memory", "q0", "q1", "r4");
|
|
}
|
|
#endif // __aarch64__
|
|
|
|
dst0 += 15;
|
|
#else
|
|
int remain = srcw;
|
|
#endif // __ARM_NEON
|
|
|
|
for (; remain > 0; remain--)
|
|
{
|
|
*dst0 = *src0;
|
|
|
|
src0 += 1;
|
|
dst0 -= 1;
|
|
}
|
|
|
|
src0 += srcwgap;
|
|
dst0 += wgap;
|
|
}
|
|
}
|
|
|
|
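// type 2: mirror horizontally, 2 bytes per pixel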
static void kanna_rotate_2_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw * 2;
|
|
const int wgap = stride + w * 2;
|
|
|
|
const unsigned char* src0 = src;
|
|
unsigned char* dst0 = dst + w * 2 - 2;
|
|
|
|
int y = 0;
|
|
for (; y < srch; y++)
|
|
{
|
|
#if __ARM_NEON
|
|
dst0 -= 7 * 2;
|
|
|
|
int nn = srcw >> 4;
|
|
int remain = srcw - (nn << 4);
|
|
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x8x2_t _src = vld2_u8(src0);
|
|
uint8x8x2_t _src2 = vld2_u8(src0 + 8 * 2);
|
|
|
|
_src.val[0] = vrev64_u8(_src.val[0]);
|
|
_src.val[1] = vrev64_u8(_src.val[1]);
|
|
|
|
_src2.val[0] = vrev64_u8(_src2.val[0]);
|
|
_src2.val[1] = vrev64_u8(_src2.val[1]);
|
|
|
|
vst2_u8(dst0, _src);
|
|
vst2_u8(dst0 - 8 * 2, _src2);
|
|
|
|
src0 += 16 * 2;
|
|
dst0 -= 16 * 2;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"mov r4, #-16 \n"
|
|
"0: \n"
|
|
"pld [%1, #128] \n"
|
|
"vld2.u8 {d0-d1}, [%1]! \n"
|
|
"vrev64.u8 d0, d0 \n"
|
|
"pld [%1, #128] \n"
|
|
"vld2.u8 {d2-d3}, [%1]! \n"
|
|
"vrev64.u8 d1, d1 \n"
|
|
"vrev64.u8 d2, d2 \n"
|
|
"vst2.u8 {d0-d1}, [%2], r4 \n"
|
|
"vrev64.u8 d3, d3 \n"
|
|
"subs %0, #1 \n"
|
|
"vst2.u8 {d2-d3}, [%2], r4 \n"
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(dst0) // %2
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(dst0)
|
|
: "cc", "memory", "q0", "q1", "r4");
|
|
}
|
|
#endif // __aarch64__
|
|
|
|
dst0 += 7 * 2;
|
|
#else
|
|
int remain = srcw;
|
|
#endif // __ARM_NEON
|
|
|
|
for (; remain > 0; remain--)
|
|
{
|
|
dst0[0] = src0[0];
|
|
dst0[1] = src0[1];
|
|
|
|
src0 += 2;
|
|
dst0 -= 2;
|
|
}
|
|
|
|
src0 += srcwgap;
|
|
dst0 += wgap;
|
|
}
|
|
}
|
|
|
|
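// type 2: mirror horizontally, 3 bytes per pixel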
static void kanna_rotate_2_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw * 3;
|
|
const int wgap = stride + w * 3;
|
|
|
|
const unsigned char* src0 = src;
|
|
unsigned char* dst0 = dst + w * 3 - 3;
|
|
|
|
int y = 0;
|
|
for (; y < srch; y++)
|
|
{
|
|
#if __ARM_NEON
|
|
dst0 -= 7 * 3;
|
|
|
|
int nn = srcw >> 4;
|
|
int remain = srcw - (nn << 4);
|
|
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x8x3_t _src = vld3_u8(src0);
|
|
uint8x8x3_t _src2 = vld3_u8(src0 + 8 * 3);
|
|
|
|
_src.val[0] = vrev64_u8(_src.val[0]);
|
|
_src.val[1] = vrev64_u8(_src.val[1]);
|
|
_src.val[2] = vrev64_u8(_src.val[2]);
|
|
|
|
_src2.val[0] = vrev64_u8(_src2.val[0]);
|
|
_src2.val[1] = vrev64_u8(_src2.val[1]);
|
|
_src2.val[2] = vrev64_u8(_src2.val[2]);
|
|
|
|
vst3_u8(dst0, _src);
|
|
vst3_u8(dst0 - 8 * 3, _src2);
|
|
|
|
src0 += 16 * 3;
|
|
dst0 -= 16 * 3;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"mov r4, #-24 \n"
|
|
"0: \n"
|
|
"pld [%1, #192] \n"
|
|
"vld3.u8 {d0-d2}, [%1]! \n"
|
|
"vrev64.u8 d0, d0 \n"
|
|
"vrev64.u8 d1, d1 \n"
|
|
"pld [%1, #192] \n"
|
|
"vld3.u8 {d4-d6}, [%1]! \n"
|
|
"vrev64.u8 d2, d2 \n"
|
|
"vrev64.u8 d4, d4 \n"
|
|
"vst3.u8 {d0-d2}, [%2], r4 \n"
|
|
"vrev64.u8 d5, d5 \n"
|
|
"vrev64.u8 d6, d6 \n"
|
|
"subs %0, #1 \n"
|
|
"vst3.u8 {d4-d6}, [%2], r4 \n"
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(dst0) // %2
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(dst0)
|
|
: "cc", "memory", "q0", "q1", "q2", "q3", "r4");
|
|
}
|
|
#endif // __aarch64__
|
|
|
|
dst0 += 7 * 3;
|
|
#else
|
|
int remain = srcw;
|
|
#endif // __ARM_NEON
|
|
|
|
for (; remain > 0; remain--)
|
|
{
|
|
dst0[0] = src0[0];
|
|
dst0[1] = src0[1];
|
|
dst0[2] = src0[2];
|
|
|
|
src0 += 3;
|
|
dst0 -= 3;
|
|
}
|
|
|
|
src0 += srcwgap;
|
|
dst0 += wgap;
|
|
}
|
|
}
|
|
|
|
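// type 2: mirror horizontally, 4 bytes per pixel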
static void kanna_rotate_2_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw * 4;
|
|
const int wgap = stride + w * 4;
|
|
|
|
const unsigned char* src0 = src;
|
|
unsigned char* dst0 = dst + w * 4 - 4;
|
|
|
|
int y = 0;
|
|
for (; y < srch; y++)
|
|
{
|
|
#if __ARM_NEON
|
|
dst0 -= 7 * 4;
|
|
|
|
int nn = srcw >> 4;
|
|
int remain = srcw - (nn << 4);
|
|
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x8x4_t _src = vld4_u8(src0);
|
|
uint8x8x4_t _src2 = vld4_u8(src0 + 8 * 4);
|
|
|
|
_src.val[0] = vrev64_u8(_src.val[0]);
|
|
_src.val[1] = vrev64_u8(_src.val[1]);
|
|
_src.val[2] = vrev64_u8(_src.val[2]);
|
|
_src.val[3] = vrev64_u8(_src.val[3]);
|
|
|
|
_src2.val[0] = vrev64_u8(_src2.val[0]);
|
|
_src2.val[1] = vrev64_u8(_src2.val[1]);
|
|
_src2.val[2] = vrev64_u8(_src2.val[2]);
|
|
_src2.val[3] = vrev64_u8(_src2.val[3]);
|
|
|
|
vst4_u8(dst0, _src);
|
|
vst4_u8(dst0 - 8 * 4, _src2);
|
|
|
|
src0 += 16 * 4;
|
|
dst0 -= 16 * 4;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"mov r4, #-32 \n"
|
|
"0: \n"
|
|
"pld [%1, #256] \n"
|
|
"vld4.u8 {d0-d3}, [%1]! \n"
|
|
"vrev64.u8 d0, d0 \n"
|
|
"vrev64.u8 d1, d1 \n"
|
|
"vrev64.u8 d2, d2 \n"
|
|
"pld [%1, #256] \n"
|
|
"vld4.u8 {d4-d7}, [%1]! \n"
|
|
"vrev64.u8 d3, d3 \n"
|
|
"vrev64.u8 d4, d4 \n"
|
|
"vrev64.u8 d5, d5 \n"
|
|
"vst4.u8 {d0-d3}, [%2], r4 \n"
|
|
"vrev64.u8 d6, d6 \n"
|
|
"vrev64.u8 d7, d7 \n"
|
|
"subs %0, #1 \n"
|
|
"vst4.u8 {d4-d7}, [%2], r4 \n"
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(dst0) // %2
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(dst0)
|
|
: "cc", "memory", "q0", "q1", "q2", "q3", "r4");
|
|
}
|
|
#endif // __aarch64__
|
|
|
|
dst0 += 7 * 4;
|
|
#else
|
|
int remain = srcw;
|
|
#endif // __ARM_NEON
|
|
|
|
for (; remain > 0; remain--)
|
|
{
|
|
dst0[0] = src0[0];
|
|
dst0[1] = src0[1];
|
|
dst0[2] = src0[2];
|
|
dst0[3] = src0[3];
|
|
|
|
src0 += 4;
|
|
dst0 -= 4;
|
|
}
|
|
|
|
src0 += srcwgap;
|
|
dst0 += wgap;
|
|
}
|
|
}
|
|
|
|
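// type 3: rotate 180 degrees (reverse both row order and pixel order), 1 byte per pixel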
static void kanna_rotate_3_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw;
|
|
const int wgap = stride - w;
|
|
|
|
// point to the last dst pixel
|
|
unsigned char* dstend = dst + stride * h - wgap;
|
|
|
|
const unsigned char* src0 = src;
|
|
unsigned char* dst0 = dstend - 1;
|
|
|
|
int y = 0;
|
|
for (; y < srch; y++)
|
|
{
|
|
#if __ARM_NEON
|
|
dst0 -= 15;
|
|
|
|
int nn = srcw >> 4;
|
|
int remain = srcw - (nn << 4);
|
|
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x8_t _src = vld1_u8(src0);
|
|
uint8x8_t _src2 = vld1_u8(src0 + 8);
|
|
|
|
_src = vrev64_u8(_src);
|
|
_src2 = vrev64_u8(_src2);
|
|
|
|
vst1_u8(dst0, _src2);
|
|
vst1_u8(dst0 + 8, _src);
|
|
|
|
src0 += 16;
|
|
dst0 -= 16;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"mov r4, #-16 \n"
|
|
"0: \n"
|
|
"pld [%1, #128] \n"
|
|
"vld1.u8 {d0-d1}, [%1]! \n"
|
|
"vrev64.u8 d3, d0 \n"
|
|
"vrev64.u8 d2, d1 \n"
|
|
"subs %0, #1 \n"
|
|
"vst1.u8 {d2-d3}, [%2], r4 \n"
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(dst0) // %2
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(dst0)
|
|
: "cc", "memory", "q0", "q1", "r4");
|
|
}
|
|
#endif // __aarch64__
|
|
|
|
dst0 += 15;
|
|
#else
|
|
int remain = srcw;
|
|
#endif // __ARM_NEON
|
|
|
|
for (; remain > 0; remain--)
|
|
{
|
|
*dst0 = *src0;
|
|
|
|
src0 += 1;
|
|
dst0 -= 1;
|
|
}
|
|
|
|
src0 += srcwgap;
|
|
dst0 -= wgap;
|
|
}
|
|
}
|
|
|
|
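// type 3: rotate 180 degrees, 2 bytes per pixel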
static void kanna_rotate_3_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw * 2;
|
|
const int wgap = stride - w * 2;
|
|
|
|
// point to the last dst pixel
|
|
unsigned char* dstend = dst + stride * h - wgap;
|
|
|
|
const unsigned char* src0 = src;
|
|
unsigned char* dst0 = dstend - 2;
|
|
|
|
int y = 0;
|
|
for (; y < srch; y++)
|
|
{
|
|
#if __ARM_NEON
|
|
dst0 -= 7 * 2;
|
|
|
|
int nn = srcw >> 4;
|
|
int remain = srcw - (nn << 4);
|
|
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x8x2_t _src = vld2_u8(src0);
|
|
uint8x8x2_t _src2 = vld2_u8(src0 + 8 * 2);
|
|
|
|
_src.val[0] = vrev64_u8(_src.val[0]);
|
|
_src.val[1] = vrev64_u8(_src.val[1]);
|
|
|
|
_src2.val[0] = vrev64_u8(_src2.val[0]);
|
|
_src2.val[1] = vrev64_u8(_src2.val[1]);
|
|
|
|
vst2_u8(dst0, _src);
|
|
vst2_u8(dst0 - 8 * 2, _src2);
|
|
|
|
src0 += 16 * 2;
|
|
dst0 -= 16 * 2;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"mov r4, #-16 \n"
|
|
"0: \n"
|
|
"pld [%1, #128] \n"
|
|
"vld2.u8 {d0-d1}, [%1]! \n"
|
|
"vrev64.u8 d0, d0 \n"
|
|
"pld [%1, #128] \n"
|
|
"vld2.u8 {d2-d3}, [%1]! \n"
|
|
"vrev64.u8 d1, d1 \n"
|
|
"vrev64.u8 d2, d2 \n"
|
|
"vst2.u8 {d0-d1}, [%2], r4 \n"
|
|
"vrev64.u8 d3, d3 \n"
|
|
"subs %0, #1 \n"
|
|
"vst2.u8 {d2-d3}, [%2], r4 \n"
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(dst0) // %2
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(dst0)
|
|
: "cc", "memory", "q0", "q1", "r4");
|
|
}
|
|
#endif // __aarch64__
|
|
|
|
dst0 += 7 * 2;
|
|
#else
|
|
int remain = srcw;
|
|
#endif // __ARM_NEON
|
|
|
|
for (; remain > 0; remain--)
|
|
{
|
|
dst0[0] = src0[0];
|
|
dst0[1] = src0[1];
|
|
|
|
src0 += 2;
|
|
dst0 -= 2;
|
|
}
|
|
|
|
src0 += srcwgap;
|
|
dst0 -= wgap;
|
|
}
|
|
}
|
|
|
|
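// type 3: rotate 180 degrees, 3 bytes per pixel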
static void kanna_rotate_3_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw * 3;
|
|
const int wgap = stride - w * 3;
|
|
|
|
// point to the last dst pixel
|
|
unsigned char* dstend = dst + stride * h - wgap;
|
|
|
|
const unsigned char* src0 = src;
|
|
unsigned char* dst0 = dstend - 3;
|
|
|
|
int y = 0;
|
|
for (; y < srch; y++)
|
|
{
|
|
#if __ARM_NEON
|
|
dst0 -= 7 * 3;
|
|
|
|
int nn = srcw >> 4;
|
|
int remain = srcw - (nn << 4);
|
|
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x8x3_t _src = vld3_u8(src0);
|
|
uint8x8x3_t _src2 = vld3_u8(src0 + 8 * 3);
|
|
|
|
_src.val[0] = vrev64_u8(_src.val[0]);
|
|
_src.val[1] = vrev64_u8(_src.val[1]);
|
|
_src.val[2] = vrev64_u8(_src.val[2]);
|
|
|
|
_src2.val[0] = vrev64_u8(_src2.val[0]);
|
|
_src2.val[1] = vrev64_u8(_src2.val[1]);
|
|
_src2.val[2] = vrev64_u8(_src2.val[2]);
|
|
|
|
vst3_u8(dst0, _src);
|
|
vst3_u8(dst0 - 8 * 3, _src2);
|
|
|
|
src0 += 16 * 3;
|
|
dst0 -= 16 * 3;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"mov r4, #-24 \n"
|
|
"0: \n"
|
|
"pld [%1, #192] \n"
|
|
"vld3.u8 {d0-d2}, [%1]! \n"
|
|
"vrev64.u8 d0, d0 \n"
|
|
"vrev64.u8 d1, d1 \n"
|
|
"pld [%1, #192] \n"
|
|
"vld3.u8 {d4-d6}, [%1]! \n"
|
|
"vrev64.u8 d2, d2 \n"
|
|
"vrev64.u8 d4, d4 \n"
|
|
"vst3.u8 {d0-d2}, [%2], r4 \n"
|
|
"vrev64.u8 d5, d5 \n"
|
|
"vrev64.u8 d6, d6 \n"
|
|
"subs %0, #1 \n"
|
|
"vst3.u8 {d4-d6}, [%2], r4 \n"
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(dst0) // %2
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(dst0)
|
|
: "cc", "memory", "q0", "q1", "q2", "q3", "r4");
|
|
}
|
|
#endif // __aarch64__
|
|
|
|
dst0 += 7 * 3;
|
|
#else
|
|
int remain = srcw;
|
|
#endif // __ARM_NEON
|
|
|
|
for (; remain > 0; remain--)
|
|
{
|
|
dst0[0] = src0[0];
|
|
dst0[1] = src0[1];
|
|
dst0[2] = src0[2];
|
|
|
|
src0 += 3;
|
|
dst0 -= 3;
|
|
}
|
|
|
|
src0 += srcwgap;
|
|
dst0 -= wgap;
|
|
}
|
|
}
|
|
|
|
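// type 3: rotate 180 degrees, 4 bytes per pixel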
static void kanna_rotate_3_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw * 4;
|
|
const int wgap = stride - w * 4;
|
|
|
|
// point to the last dst pixel
|
|
unsigned char* dstend = dst + stride * h - wgap;
|
|
|
|
const unsigned char* src0 = src;
|
|
unsigned char* dst0 = dstend - 4;
|
|
|
|
int y = 0;
|
|
for (; y < srch; y++)
|
|
{
|
|
#if __ARM_NEON
|
|
dst0 -= 7 * 4;
|
|
|
|
int nn = srcw >> 4;
|
|
int remain = srcw - (nn << 4);
|
|
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x8x4_t _src = vld4_u8(src0);
|
|
uint8x8x4_t _src2 = vld4_u8(src0 + 8 * 4);
|
|
|
|
_src.val[0] = vrev64_u8(_src.val[0]);
|
|
_src.val[1] = vrev64_u8(_src.val[1]);
|
|
_src.val[2] = vrev64_u8(_src.val[2]);
|
|
_src.val[3] = vrev64_u8(_src.val[3]);
|
|
|
|
_src2.val[0] = vrev64_u8(_src2.val[0]);
|
|
_src2.val[1] = vrev64_u8(_src2.val[1]);
|
|
_src2.val[2] = vrev64_u8(_src2.val[2]);
|
|
_src2.val[3] = vrev64_u8(_src2.val[3]);
|
|
|
|
vst4_u8(dst0, _src);
|
|
vst4_u8(dst0 - 8 * 4, _src2);
|
|
|
|
src0 += 16 * 4;
|
|
dst0 -= 16 * 4;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"mov r4, #-32 \n"
|
|
"0: \n"
|
|
"pld [%1, #256] \n"
|
|
"vld4.u8 {d0-d3}, [%1]! \n"
|
|
"vrev64.u8 d0, d0 \n"
|
|
"vrev64.u8 d1, d1 \n"
|
|
"vrev64.u8 d2, d2 \n"
|
|
"pld [%1, #256] \n"
|
|
"vld4.u8 {d4-d7}, [%1]! \n"
|
|
"vrev64.u8 d3, d3 \n"
|
|
"vrev64.u8 d4, d4 \n"
|
|
"vrev64.u8 d5, d5 \n"
|
|
"vst4.u8 {d0-d3}, [%2], r4 \n"
|
|
"vrev64.u8 d6, d6 \n"
|
|
"vrev64.u8 d7, d7 \n"
|
|
"subs %0, #1 \n"
|
|
"vst4.u8 {d4-d7}, [%2], r4 \n"
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(dst0) // %2
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(dst0)
|
|
: "cc", "memory", "q0", "q1", "q2", "q3", "r4");
|
|
}
|
|
#endif // __aarch64__
|
|
|
|
dst0 += 7 * 4;
|
|
#else
|
|
int remain = srcw;
|
|
#endif // __ARM_NEON
|
|
|
|
for (; remain > 0; remain--)
|
|
{
|
|
dst0[0] = src0[0];
|
|
dst0[1] = src0[1];
|
|
dst0[2] = src0[2];
|
|
dst0[3] = src0[3];
|
|
|
|
src0 += 4;
|
|
dst0 -= 4;
|
|
}
|
|
|
|
src0 += srcwgap;
|
|
dst0 -= wgap;
|
|
}
|
|
}
|
|
|
|
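// type 4: flip vertically (reverse row order, keep pixel order within rows), 1 byte per pixel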
static void kanna_rotate_4_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw;
|
|
const int wgap = stride + w;
|
|
|
|
// point to the last dst pixel row
|
|
unsigned char* dstend = dst + stride * (h - 1);
|
|
|
|
const unsigned char* src0 = src;
|
|
const unsigned char* src1 = src + srcstride;
|
|
unsigned char* dst0 = dstend;
|
|
unsigned char* dst1 = dstend - stride;
|
|
|
|
int y = 0;
|
|
for (; y + 1 < srch; y += 2)
|
|
{
|
|
#if __ARM_NEON
|
|
int nn = srcw >> 5;
|
|
int remain = srcw - (nn << 5);
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x16_t _src0 = vld1q_u8(src0);
|
|
uint8x16_t _src0n = vld1q_u8(src0 + 16);
|
|
vst1q_u8(dst0, _src0);
|
|
vst1q_u8(dst0 + 16, _src0n);
|
|
|
|
uint8x16_t _src1 = vld1q_u8(src1);
|
|
uint8x16_t _src1n = vld1q_u8(src1 + 16);
|
|
vst1q_u8(dst1, _src1);
|
|
vst1q_u8(dst1 + 16, _src1n);
|
|
|
|
src0 += 32;
|
|
src1 += 32;
|
|
dst0 += 32;
|
|
dst1 += 32;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"0: \n"
|
|
"pld [%1, #256] \n"
|
|
"vld1.u8 {d0-d3}, [%1]! \n"
|
|
"pld [%2, #256] \n"
|
|
"vld1.u8 {d4-d7}, [%2]! \n"
|
|
"subs %0, #1 \n"
|
|
"vst1.u8 {d0-d3}, [%3]! \n"
|
|
"vst1.u8 {d4-d7}, [%4]! \n"
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(src1), // %2
|
|
"=r"(dst0), // %3
|
|
"=r"(dst1) // %4
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(src1),
|
|
"3"(dst0),
|
|
"4"(dst1)
|
|
: "cc", "memory", "q0", "q1", "q2", "q3");
|
|
}
|
|
#endif // __aarch64__
|
|
#else
|
|
int remain = srcw;
|
|
#endif // __ARM_NEON
|
|
|
|
for (; remain > 0; remain--)
|
|
{
|
|
*dst0++ = *src0++;
|
|
*dst1++ = *src1++;
|
|
}
|
|
|
|
src0 += srcwgap + srcstride;
|
|
src1 += srcwgap + srcstride;
|
|
dst0 -= wgap + stride;
|
|
dst1 -= wgap + stride;
|
|
}
|
|
|
|
for (; y < srch; y++)
|
|
{
|
|
#if __ARM_NEON
|
|
int nn = srcw >> 5;
|
|
int remain = srcw - (nn << 5);
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x16_t _src = vld1q_u8(src0);
|
|
uint8x16_t _src2 = vld1q_u8(src0 + 16);
|
|
vst1q_u8(dst0, _src);
|
|
vst1q_u8(dst0 + 16, _src2);
|
|
|
|
src0 += 32;
|
|
dst0 += 32;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"0: \n"
|
|
"pld [%1, #256] \n"
|
|
"vld1.u8 {d0-d3}, [%1]! \n"
|
|
"subs %0, #1 \n"
|
|
"vst1.u8 {d0-d3}, [%2]! \n"
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(dst0) // %2
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(dst0)
|
|
: "cc", "memory", "q0", "q1");
|
|
}
|
|
#endif // __aarch64__
|
|
#else
|
|
int remain = srcw;
|
|
#endif // __ARM_NEON
|
|
|
|
for (; remain > 0; remain--)
|
|
{
|
|
*dst0++ = *src0++;
|
|
}
|
|
|
|
src0 += srcwgap;
|
|
dst0 -= wgap;
|
|
}
|
|
}
|
|
|
|
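// type 4: flip vertically, 2 bytes per pixel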
static void kanna_rotate_4_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw * 2;
|
|
const int wgap = stride + w * 2;
|
|
|
|
// point to the last dst pixel row
|
|
unsigned char* dstend = dst + stride * (h - 1);
|
|
|
|
int size = srcw * 2;
|
|
|
|
const unsigned char* src0 = src;
|
|
const unsigned char* src1 = src + srcstride;
|
|
unsigned char* dst0 = dstend;
|
|
unsigned char* dst1 = dstend - stride;
|
|
|
|
int y = 0;
|
|
for (; y + 1 < srch; y += 2)
|
|
{
|
|
#if __ARM_NEON
|
|
int nn = size >> 5;
|
|
int remain = size - (nn << 5);
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x16_t _src0 = vld1q_u8(src0);
|
|
uint8x16_t _src0n = vld1q_u8(src0 + 16);
|
|
vst1q_u8(dst0, _src0);
|
|
vst1q_u8(dst0 + 16, _src0n);
|
|
|
|
uint8x16_t _src1 = vld1q_u8(src1);
|
|
uint8x16_t _src1n = vld1q_u8(src1 + 16);
|
|
vst1q_u8(dst1, _src1);
|
|
vst1q_u8(dst1 + 16, _src1n);
|
|
|
|
src0 += 32;
|
|
src1 += 32;
|
|
dst0 += 32;
|
|
dst1 += 32;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"0: \n"
|
|
"pld [%1, #256] \n"
|
|
"vld1.u8 {d0-d3}, [%1]! \n"
|
|
"pld [%2, #256] \n"
|
|
"vld1.u8 {d4-d7}, [%2]! \n"
|
|
"subs %0, #1 \n"
|
|
"vst1.u8 {d0-d3}, [%3]! \n"
|
|
"vst1.u8 {d4-d7}, [%4]! \n"
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(src1), // %2
|
|
"=r"(dst0), // %3
|
|
"=r"(dst1) // %4
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(src1),
|
|
"3"(dst0),
|
|
"4"(dst1)
|
|
: "cc", "memory", "q0", "q1", "q2", "q3");
|
|
}
|
|
#endif // __aarch64__
|
|
#else
|
|
int remain = size;
|
|
#endif // __ARM_NEON
|
|
|
|
for (; remain > 0; remain--)
|
|
{
|
|
*dst0++ = *src0++;
|
|
*dst1++ = *src1++;
|
|
}
|
|
|
|
src0 += srcwgap + srcstride;
|
|
src1 += srcwgap + srcstride;
|
|
dst0 -= wgap + stride;
|
|
dst1 -= wgap + stride;
|
|
}
|
|
|
|
for (; y < srch; y++)
|
|
{
|
|
#if __ARM_NEON
|
|
int nn = size >> 5;
|
|
int remain = size - (nn << 5);
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x16_t _src = vld1q_u8(src0);
|
|
uint8x16_t _src2 = vld1q_u8(src0 + 16);
|
|
vst1q_u8(dst0, _src);
|
|
vst1q_u8(dst0 + 16, _src2);
|
|
|
|
src0 += 32;
|
|
dst0 += 32;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"0: \n"
|
|
"pld [%1, #256] \n"
|
|
"vld1.u8 {d0-d3}, [%1]! \n"
|
|
"subs %0, #1 \n"
|
|
"vst1.u8 {d0-d3}, [%2]! \n"
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(dst0) // %2
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(dst0)
|
|
: "cc", "memory", "q0", "q1");
|
|
}
|
|
#endif // __aarch64__
|
|
#else
|
|
int remain = size;
|
|
#endif // __ARM_NEON
|
|
|
|
for (; remain > 0; remain--)
|
|
{
|
|
*dst0++ = *src0++;
|
|
}
|
|
|
|
src0 += srcwgap;
|
|
dst0 -= wgap;
|
|
}
|
|
}
|
|
|
|
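// type 4: flip vertically, 3 bytes per pixel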
static void kanna_rotate_4_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw * 3;
|
|
const int wgap = stride + w * 3;
|
|
|
|
// point to the last dst pixel row
|
|
unsigned char* dstend = dst + stride * (h - 1);
|
|
|
|
int size = srcw * 3;
|
|
|
|
const unsigned char* src0 = src;
|
|
const unsigned char* src1 = src + srcstride;
|
|
unsigned char* dst0 = dstend;
|
|
unsigned char* dst1 = dstend - stride;
|
|
|
|
int y = 0;
|
|
for (; y + 1 < srch; y += 2)
|
|
{
|
|
#if __ARM_NEON
|
|
int nn = size >> 5;
|
|
int remain = size - (nn << 5);
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x16_t _src0 = vld1q_u8(src0);
|
|
uint8x16_t _src0n = vld1q_u8(src0 + 16);
|
|
vst1q_u8(dst0, _src0);
|
|
vst1q_u8(dst0 + 16, _src0n);
|
|
|
|
uint8x16_t _src1 = vld1q_u8(src1);
|
|
uint8x16_t _src1n = vld1q_u8(src1 + 16);
|
|
vst1q_u8(dst1, _src1);
|
|
vst1q_u8(dst1 + 16, _src1n);
|
|
|
|
src0 += 32;
|
|
src1 += 32;
|
|
dst0 += 32;
|
|
dst1 += 32;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"0: \n"
|
|
"pld [%1, #256] \n"
|
|
"vld1.u8 {d0-d3}, [%1]! \n"
|
|
"pld [%2, #256] \n"
|
|
"vld1.u8 {d4-d7}, [%2]! \n"
|
|
"subs %0, #1 \n"
|
|
"vst1.u8 {d0-d3}, [%3]! \n"
|
|
"vst1.u8 {d4-d7}, [%4]! \n"
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(src1), // %2
|
|
"=r"(dst0), // %3
|
|
"=r"(dst1) // %4
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(src1),
|
|
"3"(dst0),
|
|
"4"(dst1)
|
|
: "cc", "memory", "q0", "q1", "q2", "q3");
|
|
}
|
|
#endif // __aarch64__
|
|
#else
|
|
int remain = size;
|
|
#endif // __ARM_NEON
|
|
|
|
for (; remain > 0; remain--)
|
|
{
|
|
*dst0++ = *src0++;
|
|
*dst1++ = *src1++;
|
|
}
|
|
|
|
src0 += srcwgap + srcstride;
|
|
src1 += srcwgap + srcstride;
|
|
dst0 -= wgap + stride;
|
|
dst1 -= wgap + stride;
|
|
}
|
|
|
|
for (; y < srch; y++)
|
|
{
|
|
#if __ARM_NEON
|
|
int nn = size >> 5;
|
|
int remain = size - (nn << 5);
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x16_t _src = vld1q_u8(src0);
|
|
uint8x16_t _src2 = vld1q_u8(src0 + 16);
|
|
vst1q_u8(dst0, _src);
|
|
vst1q_u8(dst0 + 16, _src2);
|
|
|
|
src0 += 32;
|
|
dst0 += 32;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"0: \n"
|
|
"pld [%1, #256] \n"
|
|
"vld1.u8 {d0-d3}, [%1]! \n"
|
|
"subs %0, #1 \n"
|
|
"vst1.u8 {d0-d3}, [%2]! \n"
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(dst0) // %2
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(dst0)
|
|
: "cc", "memory", "q0", "q1");
|
|
}
|
|
#endif // __aarch64__
|
|
#else
|
|
int remain = size;
|
|
#endif // __ARM_NEON
|
|
|
|
for (; remain > 0; remain--)
|
|
{
|
|
*dst0++ = *src0++;
|
|
}
|
|
|
|
src0 += srcwgap;
|
|
dst0 -= wgap;
|
|
}
|
|
}
|
|
|
|
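// type 4: flip vertically, 4 bytes per pixel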
static void kanna_rotate_4_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw * 4;
|
|
const int wgap = stride + w * 4;
|
|
|
|
// point to the last dst pixel row
|
|
unsigned char* dstend = dst + stride * (h - 1);
|
|
|
|
int size = srcw * 4;
|
|
|
|
const unsigned char* src0 = src;
|
|
const unsigned char* src1 = src + srcstride;
|
|
unsigned char* dst0 = dstend;
|
|
unsigned char* dst1 = dstend - stride;
|
|
|
|
int y = 0;
|
|
for (; y + 1 < srch; y += 2)
|
|
{
|
|
#if __ARM_NEON
|
|
int nn = size >> 5;
|
|
int remain = size - (nn << 5);
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x16_t _src0 = vld1q_u8(src0);
|
|
uint8x16_t _src0n = vld1q_u8(src0 + 16);
|
|
vst1q_u8(dst0, _src0);
|
|
vst1q_u8(dst0 + 16, _src0n);
|
|
|
|
uint8x16_t _src1 = vld1q_u8(src1);
|
|
uint8x16_t _src1n = vld1q_u8(src1 + 16);
|
|
vst1q_u8(dst1, _src1);
|
|
vst1q_u8(dst1 + 16, _src1n);
|
|
|
|
src0 += 32;
|
|
src1 += 32;
|
|
dst0 += 32;
|
|
dst1 += 32;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"0: \n"
|
|
"pld [%1, #256] \n"
|
|
"vld1.u8 {d0-d3}, [%1]! \n"
|
|
"pld [%2, #256] \n"
|
|
"vld1.u8 {d4-d7}, [%2]! \n"
|
|
"subs %0, #1 \n"
|
|
"vst1.u8 {d0-d3}, [%3]! \n"
|
|
"vst1.u8 {d4-d7}, [%4]! \n"
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(src1), // %2
|
|
"=r"(dst0), // %3
|
|
"=r"(dst1) // %4
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(src1),
|
|
"3"(dst0),
|
|
"4"(dst1)
|
|
: "cc", "memory", "q0", "q1", "q2", "q3");
|
|
}
|
|
#endif // __aarch64__
|
|
#else
|
|
int remain = size;
|
|
#endif // __ARM_NEON
|
|
|
|
for (; remain > 0; remain--)
|
|
{
|
|
*dst0++ = *src0++;
|
|
*dst1++ = *src1++;
|
|
}
|
|
|
|
src0 += srcwgap + srcstride;
|
|
src1 += srcwgap + srcstride;
|
|
dst0 -= wgap + stride;
|
|
dst1 -= wgap + stride;
|
|
}
|
|
|
|
for (; y < srch; y++)
|
|
{
|
|
#if __ARM_NEON
|
|
int nn = size >> 5;
|
|
int remain = size - (nn << 5);
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x16_t _src = vld1q_u8(src0);
|
|
uint8x16_t _src2 = vld1q_u8(src0 + 16);
|
|
vst1q_u8(dst0, _src);
|
|
vst1q_u8(dst0 + 16, _src2);
|
|
|
|
src0 += 32;
|
|
dst0 += 32;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"0: \n"
|
|
"pld [%1, #256] \n"
|
|
"vld1.u8 {d0-d3}, [%1]! \n"
|
|
"subs %0, #1 \n"
|
|
"vst1.u8 {d0-d3}, [%2]! \n"
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(dst0) // %2
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(dst0)
|
|
: "cc", "memory", "q0", "q1");
|
|
}
|
|
#endif // __aarch64__
|
|
#else
|
|
int remain = size;
|
|
#endif // __ARM_NEON
|
|
|
|
for (; remain > 0; remain--)
|
|
{
|
|
*dst0++ = *src0++;
|
|
}
|
|
|
|
src0 += srcwgap;
|
|
dst0 -= wgap;
|
|
}
|
|
}
|
|
|
|
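// type 5: transpose, dst(x, y) = src(y, x), 1 byte per pixel; the NEON path gathers 8 rows of 8 pixels and transposes the 8x8 block with vtrn.u8 / vtrn.u16 / vtrn.u32 before storing down the destination columns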
static void kanna_rotate_5_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw;
|
|
|
|
const unsigned char* src0 = src;
|
|
|
|
int y = 0;
|
|
#if __ARM_NEON
|
|
for (; y + 7 < srch; y += 8)
|
|
{
|
|
const unsigned char* src1 = src0 + srcstride;
|
|
|
|
unsigned char* dst0 = dst + y;
|
|
unsigned char* dst1 = dst + y + stride;
|
|
|
|
int src_step = 2 * srcstride;
|
|
int dst_step = 2 * stride;
|
|
|
|
int nn = srcw >> 3;
|
|
int remain = srcw - (nn << 3);
|
|
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x8_t _src0 = vld1_u8(src0);
|
|
uint8x8_t _src1 = vld1_u8(src1);
|
|
|
|
uint8x8_t _src2 = vld1_u8(src0 + src_step);
|
|
uint8x8_t _src3 = vld1_u8(src1 + src_step);
|
|
|
|
uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step);
|
|
uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step);
|
|
|
|
uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step);
|
|
uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step);
|
|
|
|
uint8x8x2_t _src01t_r = vtrn_u8(_src0, _src1);
|
|
uint8x8x2_t _src23t_r = vtrn_u8(_src2, _src3);
|
|
uint8x8x2_t _src45t_r = vtrn_u8(_src4, _src5);
|
|
uint8x8x2_t _src67t_r = vtrn_u8(_src6, _src7);
|
|
|
|
uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
|
|
uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
|
|
uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
|
|
uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
|
|
|
|
uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
|
|
uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
|
|
uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
|
|
uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
|
|
|
|
uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[0]);
|
|
uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[0]);
|
|
uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[0]);
|
|
uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[0]);
|
|
uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[1]);
|
|
uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[1]);
|
|
uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[1]);
|
|
uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[1]);
|
|
|
|
vst1_u8(dst0, _dst0);
|
|
vst1_u8(dst1, _dst1);
|
|
vst1_u8(dst0 + dst_step, _dst2);
|
|
vst1_u8(dst1 + dst_step, _dst3);
|
|
vst1_u8(dst0 + 2 * dst_step, _dst4);
|
|
vst1_u8(dst1 + 2 * dst_step, _dst5);
|
|
vst1_u8(dst0 + 3 * dst_step, _dst6);
|
|
vst1_u8(dst1 + 3 * dst_step, _dst7);
|
|
|
|
src0 += 8;
|
|
src1 += 8;
|
|
|
|
dst0 += 4 * dst_step;
|
|
dst1 += 4 * dst_step;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"0: \n"
|
|
"pld [%1, #64] \n"
|
|
"vld1.u8 {d0}, [%1], %10 \n"
|
|
|
|
"pld [%2, #64] \n"
|
|
"vld1.u8 {d1}, [%2], %10 \n"
|
|
|
|
"pld [%1, #64] \n"
|
|
"vld1.u8 {d2}, [%1], %10 \n"
|
|
|
|
"vtrn.u8 d0, d1 \n" // _src01t_r
|
|
|
|
"pld [%2, #64] \n"
|
|
"vld1.u8 {d3}, [%2], %10 \n"
|
|
|
|
"pld [%1, #64] \n"
|
|
"vld1.u8 {d4}, [%1], %10 \n"
|
|
|
|
"vtrn.u8 d2, d3 \n" // _src23t_r
|
|
|
|
"pld [%2, #64] \n"
|
|
"vld1.u8 {d5}, [%2], %10 \n"
|
|
|
|
"pld [%1, #64] \n"
|
|
"vld1.u8 {d6}, [%1], %10 \n"
|
|
|
|
"vtrn.u8 d4, d5 \n" // _src45t_r
|
|
|
|
"pld [%2, #64] \n"
|
|
"vld1.u8 {d7}, [%2], %10 \n"
|
|
|
|
"vtrn.u8 d6, d7 \n" // _src67t_r
|
|
|
|
"sub %1, %1, %10, lsl #2 \n" // restore src0
|
|
|
|
"vtrn.u16 q0, q1 \n" // _src02tt_r _src13tt_r
|
|
|
|
"sub %2, %2, %10, lsl #2 \n" // restore src1
|
|
|
|
"vtrn.u16 q2, q3 \n" // _src13tt_r _src46tt_r
|
|
|
|
"add %1, #8 \n" // src0 += 8
|
|
|
|
"vtrn.u32 q0, q2 \n" // _src04ttt_r _src15ttt_r
|
|
|
|
"add %2, #8 \n" // src1 += 8
|
|
|
|
"vtrn.u32 q1, q3 \n" // _src26ttt_r _src37ttt_r
|
|
"vst1.u8 {d0}, [%3], %11 \n"
|
|
"vst1.u8 {d1}, [%4], %11 \n"
|
|
|
|
"subs %0, #1 \n"
|
|
|
|
"vst1.u8 {d2}, [%3], %11 \n"
|
|
"vst1.u8 {d3}, [%4], %11 \n"
|
|
"vst1.u8 {d4}, [%3], %11 \n"
|
|
"vst1.u8 {d5}, [%4], %11 \n"
|
|
"vst1.u8 {d6}, [%3], %11 \n"
|
|
"vst1.u8 {d7}, [%4], %11 \n"
|
|
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(src1), // %2
|
|
"=r"(dst0), // %3
|
|
"=r"(dst1) // %4
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(src1),
|
|
"3"(dst0),
|
|
"4"(dst1),
|
|
"r"(src_step), // %10
|
|
"r"(dst_step) // %11
|
|
: "cc", "memory", "q0", "q1", "q2", "q3");
|
|
}
|
|
#endif // __aarch64__
|
|
for (; remain > 0; remain--)
|
|
{
|
|
dst0[0] = src0[0];
|
|
dst0[1] = src1[0];
|
|
dst0[2] = src0[0 + src_step];
|
|
dst0[3] = src1[0 + src_step];
|
|
dst0[4] = src0[0 + 2 * src_step];
|
|
dst0[5] = src1[0 + 2 * src_step];
|
|
dst0[6] = src0[0 + 3 * src_step];
|
|
dst0[7] = src1[0 + 3 * src_step];
|
|
|
|
src0 += 1;
|
|
src1 += 1;
|
|
|
|
dst0 += stride;
|
|
}
|
|
|
|
src0 += srcwgap + 7 * srcstride;
|
|
}
|
|
#endif // __ARM_NEON
|
|
for (; y < srch; y++)
|
|
{
|
|
unsigned char* dst0 = dst + y;
|
|
|
|
int x = 0;
|
|
for (; x < srcw; x++)
|
|
{
|
|
*dst0 = *src0;
|
|
|
|
src0 += 1;
|
|
dst0 += stride;
|
|
}
|
|
|
|
src0 += srcwgap;
|
|
}
|
|
}
|
|
|
|
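// type 5: transpose, 2 bytes per pixel, same per-channel 8x8 block transpose as the 1-channel path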
static void kanna_rotate_5_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw * 2;
|
|
|
|
const unsigned char* src0 = src;
|
|
|
|
int y = 0;
|
|
#if __ARM_NEON
|
|
for (; y + 7 < srch; y += 8)
|
|
{
|
|
const unsigned char* src1 = src0 + srcstride;
|
|
|
|
unsigned char* dst0 = dst + y * 2;
|
|
unsigned char* dst1 = dst + y * 2 + stride;
|
|
|
|
int src_step = 2 * srcstride;
|
|
int dst_step = 2 * stride;
|
|
|
|
int nn = srcw >> 3;
|
|
int remain = srcw - (nn << 3);
|
|
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x8x2_t _src0 = vld2_u8(src0);
|
|
uint8x8x2_t _src1 = vld2_u8(src1);
|
|
|
|
uint8x8x2_t _src2 = vld2_u8(src0 + src_step);
|
|
uint8x8x2_t _src3 = vld2_u8(src1 + src_step);
|
|
|
|
uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step);
|
|
uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step);
|
|
|
|
uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step);
|
|
uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step);
|
|
|
|
uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
|
|
uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
|
|
uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
|
|
uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);
|
|
|
|
uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
|
|
uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
|
|
uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
|
|
uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);
|
|
|
|
uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
|
|
uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
|
|
uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
|
|
uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
|
|
|
|
uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
|
|
uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
|
|
uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
|
|
uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));
|
|
|
|
uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
|
|
uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
|
|
uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
|
|
uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
|
|
|
|
uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
|
|
uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
|
|
uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
|
|
uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));
|
|
|
|
uint8x8x2_t _dst0;
|
|
uint8x8x2_t _dst1;
|
|
uint8x8x2_t _dst2;
|
|
uint8x8x2_t _dst3;
|
|
uint8x8x2_t _dst4;
|
|
uint8x8x2_t _dst5;
|
|
uint8x8x2_t _dst6;
|
|
uint8x8x2_t _dst7;
|
|
|
|
_dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
|
|
_dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
|
|
_dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
|
|
_dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
|
|
_dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
|
|
_dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
|
|
_dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
|
|
_dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
|
|
|
|
_dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
|
|
_dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
|
|
_dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
|
|
_dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
|
|
_dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
|
|
_dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
|
|
_dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
|
|
_dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
|
|
|
|
vst2_u8(dst0, _dst0);
|
|
vst2_u8(dst1, _dst1);
|
|
vst2_u8(dst0 + dst_step, _dst2);
|
|
vst2_u8(dst1 + dst_step, _dst3);
|
|
vst2_u8(dst0 + 2 * dst_step, _dst4);
|
|
vst2_u8(dst1 + 2 * dst_step, _dst5);
|
|
vst2_u8(dst0 + 3 * dst_step, _dst6);
|
|
vst2_u8(dst1 + 3 * dst_step, _dst7);
|
|
|
|
src0 += 2 * 8;
|
|
src1 += 2 * 8;
|
|
|
|
dst0 += 4 * dst_step;
|
|
dst1 += 4 * dst_step;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"0: \n"
|
|
"pld [%1, #128] \n"
|
|
"vld2.u8 {d0-d1}, [%1], %10 \n"
|
|
|
|
"pld [%2, #128] \n"
|
|
"vld2.u8 {d2-d3}, [%2], %10 \n"
|
|
|
|
"pld [%1, #128] \n"
|
|
"vld2.u8 {d4-d5}, [%1], %10 \n"
|
|
|
|
"vtrn.u8 q0, q1 \n" // _src01t_r
|
|
|
|
"pld [%2, #128] \n"
|
|
"vld2.u8 {d6-d7}, [%2], %10 \n"
|
|
|
|
"pld [%1, #128] \n"
|
|
"vld2.u8 {d16-d17}, [%1], %10\n"
|
|
|
|
"vtrn.u8 q2, q3 \n" // _src23t_r
|
|
|
|
"pld [%2, #128] \n"
|
|
"vld2.u8 {d18-d19}, [%2], %10\n"
|
|
|
|
"pld [%1, #128] \n"
|
|
"vld2.u8 {d20-d21}, [%1], %10\n"
|
|
|
|
"vtrn.u8 q8, q9 \n" // _src45t_r
|
|
|
|
"pld [%2, #128] \n"
|
|
"vld2.u8 {d22-d23}, [%2], %10\n"
|
|
|
|
"vtrn.u8 q10, q11 \n" // _src67t_r
|
|
|
|
"sub %1, %1, %10, lsl #2 \n" // restore src0
|
|
|
|
"vtrn.u16 q0, q2 \n" // _src02tt_r
|
|
|
|
"sub %2, %2, %10, lsl #2 \n" // restore src1
|
|
|
|
"vtrn.u16 q1, q3 \n" // _src13tt_r
|
|
|
|
"add %1, #16 \n" // src0 += 16
|
|
|
|
"vtrn.u16 q8, q10 \n" // _src46tt_r
|
|
|
|
"add %2, #16 \n" // src1 += 16
|
|
|
|
"vtrn.u16 q9, q11 \n" // _src57tt_r
|
|
|
|
"vtrn.u32 q0, q8 \n" // _src04ttt_r
|
|
|
|
"vtrn.u32 q1, q9 \n" // _src15ttt_r
|
|
"vst2.u8 {d0-d1}, [%3], %11 \n"
|
|
|
|
"vtrn.u32 q2, q10 \n" // _src26ttt_r
|
|
"vst2.u8 {d2-d3}, [%4], %11 \n"
|
|
|
|
"vtrn.u32 q3, q11 \n" // _src37ttt_r
|
|
"vst2.u8 {d4-d5}, [%3], %11 \n"
|
|
|
|
"subs %0, #1 \n"
|
|
|
|
"vst2.u8 {d6-d7}, [%4], %11 \n"
|
|
"vst2.u8 {d16-d17}, [%3], %11\n"
|
|
"vst2.u8 {d18-d19}, [%4], %11\n"
|
|
"vst2.u8 {d20-d21}, [%3], %11\n"
|
|
"vst2.u8 {d22-d23}, [%4], %11\n"
|
|
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(src1), // %2
|
|
"=r"(dst0), // %3
|
|
"=r"(dst1) // %4
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(src1),
|
|
"3"(dst0),
|
|
"4"(dst1),
|
|
"r"(src_step), // %10
|
|
"r"(dst_step) // %11
|
|
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
|
|
}
|
|
#endif // __aarch64__
|
|
for (; remain > 0; remain--)
|
|
{
|
|
dst0[0] = src0[0];
|
|
dst0[1] = src0[1];
|
|
dst0[2] = src1[0];
|
|
dst0[3] = src1[1];
|
|
dst0[4] = src0[0 + src_step];
|
|
dst0[5] = src0[1 + src_step];
|
|
dst0[6] = src1[0 + src_step];
|
|
dst0[7] = src1[1 + src_step];
|
|
dst0[8] = src0[0 + 2 * src_step];
|
|
dst0[9] = src0[1 + 2 * src_step];
|
|
dst0[10] = src1[0 + 2 * src_step];
|
|
dst0[11] = src1[1 + 2 * src_step];
|
|
dst0[12] = src0[0 + 3 * src_step];
|
|
dst0[13] = src0[1 + 3 * src_step];
|
|
dst0[14] = src1[0 + 3 * src_step];
|
|
dst0[15] = src1[1 + 3 * src_step];
|
|
|
|
src0 += 2;
|
|
src1 += 2;
|
|
|
|
dst0 += stride;
|
|
}
|
|
|
|
src0 += srcwgap + 7 * srcstride;
|
|
}
|
|
#endif // __ARM_NEON
|
|
for (; y < srch; y++)
|
|
{
|
|
unsigned char* dst0 = dst + y * 2;
|
|
|
|
int x = 0;
|
|
for (; x < srcw; x++)
|
|
{
|
|
dst0[0] = src0[0];
|
|
dst0[1] = src0[1];
|
|
|
|
src0 += 2;
|
|
dst0 += stride;
|
|
}
|
|
|
|
src0 += srcwgap;
|
|
}
|
|
}
|
|
|
|
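// type 5: transpose, 3 bytes per pixel, per-channel 8x8 block transpose on the NEON path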
static void kanna_rotate_5_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw * 3;
|
|
|
|
const unsigned char* src0 = src;
|
|
|
|
int y = 0;
|
|
#if __ARM_NEON
|
|
for (; y + 7 < srch; y += 8)
|
|
{
|
|
const unsigned char* src1 = src0 + srcstride;
|
|
|
|
unsigned char* dst0 = dst + y * 3;
|
|
unsigned char* dst1 = dst + y * 3 + stride;
|
|
|
|
int src_step = 2 * srcstride;
|
|
int dst_step = 2 * stride;
|
|
|
|
int nn = srcw >> 3;
|
|
int remain = srcw - (nn << 3);
|
|
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x8x3_t _src0 = vld3_u8(src0);
|
|
uint8x8x3_t _src1 = vld3_u8(src1);
|
|
|
|
uint8x8x3_t _src2 = vld3_u8(src0 + src_step);
|
|
uint8x8x3_t _src3 = vld3_u8(src1 + src_step);
|
|
|
|
uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step);
|
|
uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step);
|
|
|
|
uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step);
|
|
uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step);
|
|
|
|
uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
|
|
uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
|
|
uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
|
|
uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);
|
|
|
|
uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
|
|
uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
|
|
uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
|
|
uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);
|
|
|
|
uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]);
|
|
uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]);
|
|
uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]);
|
|
            uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]);

            uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
            uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
            uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
            uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));

            uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
            uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
            uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
            uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));

            uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0]));
            uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1]));
            uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0]));
            uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1]));

            uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
            uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
            uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
            uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));

            uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
            uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
            uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
            uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));

            uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0]));
            uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0]));
            uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1]));
            uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1]));

            uint8x8x3_t _dst0;
            uint8x8x3_t _dst1;
            uint8x8x3_t _dst2;
            uint8x8x3_t _dst3;
            uint8x8x3_t _dst4;
            uint8x8x3_t _dst5;
            uint8x8x3_t _dst6;
            uint8x8x3_t _dst7;

            _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
            _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
            _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
            _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
            _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
            _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
            _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
            _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);

            _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
            _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
            _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
            _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
            _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
            _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
            _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
            _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);

            _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
            _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
            _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
            _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
            _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
            _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
            _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
            _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);

            vst3_u8(dst0, _dst0);
            vst3_u8(dst1, _dst1);
            vst3_u8(dst0 + dst_step, _dst2);
            vst3_u8(dst1 + dst_step, _dst3);
            vst3_u8(dst0 + 2 * dst_step, _dst4);
            vst3_u8(dst1 + 2 * dst_step, _dst5);
            vst3_u8(dst0 + 3 * dst_step, _dst6);
            vst3_u8(dst1 + 3 * dst_step, _dst7);

            src0 += 3 * 8;
            src1 += 3 * 8;

            dst0 += 4 * dst_step;
            dst1 += 4 * dst_step;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "0:                             \n"
                "pld        [%1, #192]          \n"
                "vld3.u8    {d0-d2}, [%1], %10  \n"

                "pld        [%2, #192]          \n"
                "vld3.u8    {d4-d6}, [%2], %10  \n"

                "pld        [%1, #192]          \n"
                "vld3.u8    {d8-d10}, [%1], %10 \n"

                "vtrn.u8    q0, q2              \n" // _src01t_r
                "vtrn.u8    d2, d6              \n"

                "pld        [%2, #192]          \n"
                "vld3.u8    {d12-d14}, [%2], %10\n"

                "pld        [%1, #192]          \n"
                "vld3.u8    {d16-d18}, [%1], %10\n"

                "vtrn.u8    q4, q6              \n" // _src23t_r
                "vtrn.u8    d10, d14            \n"

                "pld        [%2, #192]          \n"
                "vld3.u8    {d20-d22}, [%2], %10\n"

                "pld        [%1, #192]          \n"
                "vld3.u8    {d24-d26}, [%1], %10\n"

                "vtrn.u8    q8, q10             \n" // _src45t_r
                "vtrn.u8    d18, d22            \n"

                "pld        [%2, #192]          \n"
                "vld3.u8    {d28-d30}, [%2], %10\n"

                "vtrn.u8    q12, q14            \n" // _src67t_r
                "vtrn.u8    d26, d30            \n"

                "sub        %1, %1, %10, lsl #2 \n" // restore src0

                "vtrn.u16   q0, q4              \n" // _src02tt_r
                "vtrn.u16   d2, d10             \n"

                "sub        %2, %2, %10, lsl #2 \n" // restore src1

                "vtrn.u16   q2, q6              \n" // _src13tt_r
                "vtrn.u16   d6, d14             \n"

                "add        %1, #24             \n" // src0 += 24

                "vtrn.u16   q8, q12             \n" // _src46tt_r
                "vtrn.u16   d18, d26            \n"

                "add        %2, #24             \n" // src1 += 24

                "vtrn.u16   q10, q14            \n" // _src57tt_r
                "vtrn.u16   d22, d30            \n"

                "vtrn.u32   q0, q8              \n" // _src04ttt_r
                "vtrn.u32   d2, d18             \n"

                "vtrn.u32   q2, q10             \n" // _src15ttt_r
                "vst3.u8    {d0-d2}, [%3], %11  \n"
                "vtrn.u32   d6, d22             \n"

                "vtrn.u32   q4, q12             \n" // _src26ttt_r
                "vst3.u8    {d4-d6}, [%4], %11  \n"
                "vtrn.u32   d10, d26            \n"

                "vtrn.u32   q6, q14             \n" // _src37ttt_r
                "vst3.u8    {d8-d10}, [%3], %11 \n"
                "vtrn.u32   d14, d30            \n"

                "subs       %0, #1              \n"

                "vst3.u8    {d16-d18}, [%3], %11\n"
                "vst3.u8    {d12-d14}, [%4], %11\n"
                "vst3.u8    {d20-d22}, [%4], %11\n"
                "vst3.u8    {d24-d26}, [%3], %11\n"
                "vst3.u8    {d28-d30}, [%4], %11\n"

                "bne        0b                  \n"
                : "=r"(nn),   // %0
                "=r"(src0), // %1
                "=r"(src1), // %2
                "=r"(dst0), // %3
                "=r"(dst1)  // %4
                : "0"(nn),
                "1"(src0),
                "2"(src1),
                "3"(dst0),
                "4"(dst1),
                "r"(src_step), // %10
                "r"(dst_step)  // %11
                : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
        }
#endif // __aarch64__
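        // scalar tail: each leftover source column x gathers the eight rows of
        // this block (src0/src1 plus three src_step offsets) and writes them as
        // eight consecutive RGB pixels in one destination row before stepping
        // dst0 to the next destination row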
        for (; remain > 0; remain--)
        {
            dst0[0] = src0[0];
            dst0[1] = src0[1];
            dst0[2] = src0[2];
            dst0[3] = src1[0];
            dst0[4] = src1[1];
            dst0[5] = src1[2];
            dst0[6] = src0[0 + src_step];
            dst0[7] = src0[1 + src_step];
            dst0[8] = src0[2 + src_step];
            dst0[9] = src1[0 + src_step];
            dst0[10] = src1[1 + src_step];
            dst0[11] = src1[2 + src_step];
            dst0[12] = src0[0 + 2 * src_step];
            dst0[13] = src0[1 + 2 * src_step];
            dst0[14] = src0[2 + 2 * src_step];
            dst0[15] = src1[0 + 2 * src_step];
            dst0[16] = src1[1 + 2 * src_step];
            dst0[17] = src1[2 + 2 * src_step];
            dst0[18] = src0[0 + 3 * src_step];
            dst0[19] = src0[1 + 3 * src_step];
            dst0[20] = src0[2 + 3 * src_step];
            dst0[21] = src1[0 + 3 * src_step];
            dst0[22] = src1[1 + 3 * src_step];
            dst0[23] = src1[2 + 3 * src_step];

            src0 += 3;
            src1 += 3;

            dst0 += stride;
        }

        src0 += srcwgap + 7 * srcstride;
    }
#endif // __ARM_NEON
    for (; y < srch; y++)
    {
        unsigned char* dst0 = dst + y * 3;

        int x = 0;
        for (; x < srcw; x++)
        {
            dst0[0] = src0[0];
            dst0[1] = src0[1];
            dst0[2] = src0[2];

            src0 += 3;
            dst0 += stride;
        }

        src0 += srcwgap;
    }
}

static void kanna_rotate_5_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw * 4;
|
|
|
|
const unsigned char* src0 = src;
|
|
|
|
int y = 0;
|
|
#if __ARM_NEON
|
|
for (; y + 7 < srch; y += 8)
|
|
{
|
|
const unsigned char* src1 = src0 + srcstride;
|
|
|
|
unsigned char* dst0 = dst + y * 4;
|
|
unsigned char* dst1 = dst + y * 4 + stride;
|
|
|
|
int src_step = 2 * srcstride;
|
|
int dst_step = 2 * stride;
|
|
|
|
int nn = srcw >> 3;
|
|
int remain = srcw - (nn << 3);
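        // both NEON paths below transpose one 8x8 pixel block per iteration:
        // vtrn.u8 swaps neighbouring rows, vtrn.u16 swaps row pairs and
        // vtrn.u32 swaps row quads, giving a full 8x8 byte transpose for each
        // of the four channels before the interleaved store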
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x8x4_t _src0 = vld4_u8(src0);
|
|
uint8x8x4_t _src1 = vld4_u8(src1);
|
|
|
|
uint8x8x4_t _src2 = vld4_u8(src0 + src_step);
|
|
uint8x8x4_t _src3 = vld4_u8(src1 + src_step);
|
|
|
|
uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step);
|
|
uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step);
|
|
|
|
uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step);
|
|
uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step);
|
|
|
|
uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
|
|
uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
|
|
uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
|
|
uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);
|
|
|
|
uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
|
|
uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
|
|
uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
|
|
uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);
|
|
|
|
uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]);
|
|
uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]);
|
|
uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]);
|
|
uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]);
|
|
|
|
uint8x8x2_t _src01t_a = vtrn_u8(_src0.val[3], _src1.val[3]);
|
|
uint8x8x2_t _src23t_a = vtrn_u8(_src2.val[3], _src3.val[3]);
|
|
uint8x8x2_t _src45t_a = vtrn_u8(_src4.val[3], _src5.val[3]);
|
|
uint8x8x2_t _src67t_a = vtrn_u8(_src6.val[3], _src7.val[3]);
|
|
|
|
uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
|
|
uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
|
|
uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
|
|
uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
|
|
|
|
uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
|
|
uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
|
|
uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
|
|
uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));
|
|
|
|
uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0]));
|
|
uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1]));
|
|
uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0]));
|
|
uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1]));
|
|
|
|
uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[0]), vreinterpret_u16_u8(_src23t_a.val[0]));
|
|
uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[1]), vreinterpret_u16_u8(_src23t_a.val[1]));
|
|
uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[0]), vreinterpret_u16_u8(_src67t_a.val[0]));
|
|
uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[1]), vreinterpret_u16_u8(_src67t_a.val[1]));
|
|
|
|
uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
|
|
uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
|
|
uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
|
|
uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
|
|
|
|
uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
|
|
uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
|
|
uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
|
|
uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));
|
|
|
|
uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0]));
|
|
uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0]));
|
|
uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1]));
|
|
uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1]));
|
|
|
|
uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[0]), vreinterpret_u32_u16(_src46tt_a.val[0]));
|
|
uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[0]), vreinterpret_u32_u16(_src57tt_a.val[0]));
|
|
uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[1]), vreinterpret_u32_u16(_src46tt_a.val[1]));
|
|
uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[1]), vreinterpret_u32_u16(_src57tt_a.val[1]));
|
|
|
|
uint8x8x4_t _dst0;
|
|
uint8x8x4_t _dst1;
|
|
uint8x8x4_t _dst2;
|
|
uint8x8x4_t _dst3;
|
|
uint8x8x4_t _dst4;
|
|
uint8x8x4_t _dst5;
|
|
uint8x8x4_t _dst6;
|
|
uint8x8x4_t _dst7;
|
|
|
|
_dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
|
|
_dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
|
|
_dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
|
|
_dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
|
|
_dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
|
|
_dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
|
|
_dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
|
|
_dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
|
|
|
|
_dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
|
|
_dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
|
|
_dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
|
|
_dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
|
|
_dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
|
|
_dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
|
|
_dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
|
|
_dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
|
|
|
|
_dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
|
|
_dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
|
|
_dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
|
|
_dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
|
|
_dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
|
|
_dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
|
|
_dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
|
|
_dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
|
|
|
|
_dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]);
|
|
_dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]);
|
|
_dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]);
|
|
_dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]);
|
|
_dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]);
|
|
_dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]);
|
|
_dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]);
|
|
_dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]);
|
|
|
|
vst4_u8(dst0, _dst0);
|
|
vst4_u8(dst1, _dst1);
|
|
vst4_u8(dst0 + dst_step, _dst2);
|
|
vst4_u8(dst1 + dst_step, _dst3);
|
|
vst4_u8(dst0 + 2 * dst_step, _dst4);
|
|
vst4_u8(dst1 + 2 * dst_step, _dst5);
|
|
vst4_u8(dst0 + 3 * dst_step, _dst6);
|
|
vst4_u8(dst1 + 3 * dst_step, _dst7);
|
|
|
|
src0 += 4 * 8;
|
|
src1 += 4 * 8;
|
|
|
|
dst0 += 4 * dst_step;
|
|
dst1 += 4 * dst_step;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
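            // armv7 twin of the intrinsic path above: q0-q15 hold the eight
            // de-interleaved RGBA rows, %10 is the two-row source stride
            // (src_step) and %11 the two-row destination stride (dst_step)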
asm volatile(
|
|
"0: \n"
|
|
"pld [%1, #256] \n"
|
|
"vld4.u8 {d0-d3}, [%1], %10 \n"
|
|
|
|
"pld [%2, #256] \n"
|
|
"vld4.u8 {d4-d7}, [%2], %10 \n"
|
|
|
|
"pld [%1, #256] \n"
|
|
"vld4.u8 {d8-d11}, [%1], %10 \n"
|
|
|
|
"vtrn.u8 q0, q2 \n" // _src01t_r
|
|
"vtrn.u8 q1, q3 \n"
|
|
|
|
"pld [%2, #256] \n"
|
|
"vld4.u8 {d12-d15}, [%2], %10\n"
|
|
|
|
"pld [%1, #256] \n"
|
|
"vld4.u8 {d16-d19}, [%1], %10\n"
|
|
|
|
"vtrn.u8 q4, q6 \n" // _src23t_r
|
|
"vtrn.u8 q5, q7 \n"
|
|
|
|
"pld [%2, #256] \n"
|
|
"vld4.u8 {d20-d23}, [%2], %10\n"
|
|
|
|
"pld [%1, #256] \n"
|
|
"vld4.u8 {d24-d27}, [%1], %10\n"
|
|
|
|
"vtrn.u8 q8, q10 \n" // _src45t_r
|
|
"vtrn.u8 q9, q11 \n"
|
|
|
|
"pld [%2, #256] \n"
|
|
"vld4.u8 {d28-d31}, [%2], %10\n"
|
|
|
|
"vtrn.u8 q12, q14 \n" // _src67t_r
|
|
"vtrn.u8 q13, q15 \n"
|
|
|
|
"sub %1, %1, %10, lsl #2 \n" // restore src0
|
|
|
|
"vtrn.u16 q0, q4 \n" // _src02tt_r
|
|
"vtrn.u16 q1, q5 \n"
|
|
|
|
"sub %2, %2, %10, lsl #2 \n" // restore src1
|
|
|
|
"vtrn.u16 q2, q6 \n" // _src13tt_r
|
|
"vtrn.u16 q3, q7 \n"
|
|
|
|
"add %1, #32 \n" // src0 += 32
|
|
|
|
"vtrn.u16 q8, q12 \n" // _src46tt_r
|
|
"vtrn.u16 q9, q13 \n"
|
|
|
|
"add %2, #32 \n" // src1 += 32
|
|
|
|
"vtrn.u16 q10, q14 \n" // _src57tt_r
|
|
"vtrn.u16 q11, q15 \n"
|
|
|
|
"vtrn.u32 q0, q8 \n" // _src04ttt_r
|
|
"vtrn.u32 q1, q9 \n"
|
|
|
|
"vtrn.u32 q2, q10 \n" // _src15ttt_r
|
|
"vst4.u8 {d0-d3}, [%3], %11 \n"
|
|
"vtrn.u32 q3, q11 \n"
|
|
|
|
"vtrn.u32 q4, q12 \n" // _src26ttt_r
|
|
"vst4.u8 {d4-d7}, [%4], %11 \n"
|
|
"vtrn.u32 q5, q13 \n"
|
|
|
|
"vtrn.u32 q6, q14 \n" // _src37ttt_r
|
|
"vst4.u8 {d8-d11}, [%3], %11 \n"
|
|
"vtrn.u32 q7, q15 \n"
|
|
|
|
"subs %0, #1 \n"
|
|
|
|
"vst4.u8 {d16-d19}, [%3], %11\n"
|
|
"vst4.u8 {d12-d15}, [%4], %11\n"
|
|
"vst4.u8 {d20-d23}, [%4], %11\n"
|
|
"vst4.u8 {d24-d27}, [%3], %11\n"
|
|
"vst4.u8 {d28-d31}, [%4], %11\n"
|
|
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(src1), // %2
|
|
"=r"(dst0), // %3
|
|
"=r"(dst1) // %4
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(src1),
|
|
"3"(dst0),
|
|
"4"(dst1),
|
|
"r"(src_step), // %10
|
|
"r"(dst_step) // %11
|
|
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
|
|
}
|
|
#endif // __aarch64__
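        // leftover columns: copy the eight RGBA pixels of this block (one per
        // source row) into 32 consecutive destination bytes, one destination
        // row per source column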
for (; remain > 0; remain--)
|
|
{
|
|
dst0[0] = src0[0];
|
|
dst0[1] = src0[1];
|
|
dst0[2] = src0[2];
|
|
dst0[3] = src0[3];
|
|
dst0[4] = src1[0];
|
|
dst0[5] = src1[1];
|
|
dst0[6] = src1[2];
|
|
dst0[7] = src1[3];
|
|
dst0[8] = src0[0 + src_step];
|
|
dst0[9] = src0[1 + src_step];
|
|
dst0[10] = src0[2 + src_step];
|
|
dst0[11] = src0[3 + src_step];
|
|
dst0[12] = src1[0 + src_step];
|
|
dst0[13] = src1[1 + src_step];
|
|
dst0[14] = src1[2 + src_step];
|
|
dst0[15] = src1[3 + src_step];
|
|
dst0[16] = src0[0 + 2 * src_step];
|
|
dst0[17] = src0[1 + 2 * src_step];
|
|
dst0[18] = src0[2 + 2 * src_step];
|
|
dst0[19] = src0[3 + 2 * src_step];
|
|
dst0[20] = src1[0 + 2 * src_step];
|
|
dst0[21] = src1[1 + 2 * src_step];
|
|
dst0[22] = src1[2 + 2 * src_step];
|
|
dst0[23] = src1[3 + 2 * src_step];
|
|
dst0[24] = src0[0 + 3 * src_step];
|
|
dst0[25] = src0[1 + 3 * src_step];
|
|
dst0[26] = src0[2 + 3 * src_step];
|
|
dst0[27] = src0[3 + 3 * src_step];
|
|
dst0[28] = src1[0 + 3 * src_step];
|
|
dst0[29] = src1[1 + 3 * src_step];
|
|
dst0[30] = src1[2 + 3 * src_step];
|
|
dst0[31] = src1[3 + 3 * src_step];
|
|
|
|
src0 += 4;
|
|
src1 += 4;
|
|
|
|
dst0 += stride;
|
|
}
|
|
|
|
src0 += srcwgap + 7 * srcstride;
|
|
}
|
|
#endif // __ARM_NEON
|
|
for (; y < srch; y++)
|
|
{
|
|
unsigned char* dst0 = dst + y * 4;
|
|
|
|
int x = 0;
|
|
for (; x < srcw; x++)
|
|
{
|
|
dst0[0] = src0[0];
|
|
dst0[1] = src0[1];
|
|
dst0[2] = src0[2];
|
|
dst0[3] = src0[3];
|
|
|
|
src0 += 4;
|
|
dst0 += stride;
|
|
}
|
|
|
|
src0 += srcwgap;
|
|
}
|
|
}
|
|
|
|
static void kanna_rotate_6_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw;
|
|
|
|
// point to the last dst pixel in row
|
|
unsigned char* dstend = dst + w;
|
|
|
|
const unsigned char* src0 = src;
|
|
|
|
int y = 0;
|
|
#if __ARM_NEON
|
|
for (; y + 7 < srch; y += 8)
|
|
{
|
|
const unsigned char* src1 = src0 + srcstride;
|
|
|
|
unsigned char* dst0 = dstend - y - 8;
|
|
unsigned char* dst1 = dstend - y - 8 + stride;
|
|
|
|
int src_step = 2 * srcstride;
|
|
int dst_step = 2 * stride;
|
|
|
|
int nn = srcw >> 3;
|
|
int remain = srcw - (nn << 3);
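        // same 8x8 block transpose as the rotate_5 kernels, but with the vtrn
        // operand order reversed at every stage and the eight results stored
        // last-to-first, which flips the block as required for this rotation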
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x8_t _src0 = vld1_u8(src0);
|
|
uint8x8_t _src1 = vld1_u8(src1);
|
|
|
|
uint8x8_t _src2 = vld1_u8(src0 + src_step);
|
|
uint8x8_t _src3 = vld1_u8(src1 + src_step);
|
|
|
|
uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step);
|
|
uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step);
|
|
|
|
uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step);
|
|
uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step);
|
|
|
|
uint8x8x2_t _src01t_r = vtrn_u8(_src1, _src0);
|
|
uint8x8x2_t _src23t_r = vtrn_u8(_src3, _src2);
|
|
uint8x8x2_t _src45t_r = vtrn_u8(_src5, _src4);
|
|
uint8x8x2_t _src67t_r = vtrn_u8(_src7, _src6);
|
|
|
|
uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
|
|
uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
|
|
uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
|
|
uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
|
|
|
|
uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
|
|
uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
|
|
uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
|
|
uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
|
|
|
|
uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[1]);
|
|
uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[1]);
|
|
uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[1]);
|
|
uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[1]);
|
|
uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[0]);
|
|
uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[0]);
|
|
uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[0]);
|
|
uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[0]);
|
|
|
|
vst1_u8(dst0, _dst7);
|
|
vst1_u8(dst1, _dst6);
|
|
vst1_u8(dst0 + dst_step, _dst5);
|
|
vst1_u8(dst1 + dst_step, _dst4);
|
|
vst1_u8(dst0 + 2 * dst_step, _dst3);
|
|
vst1_u8(dst1 + 2 * dst_step, _dst2);
|
|
vst1_u8(dst0 + 3 * dst_step, _dst1);
|
|
vst1_u8(dst1 + 3 * dst_step, _dst0);
|
|
|
|
src0 += 8;
|
|
src1 += 8;
|
|
|
|
dst0 += 4 * dst_step;
|
|
dst1 += 4 * dst_step;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
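            // armv7 path: d0-d7 hold the eight grey rows; the vtrn ladder and
            // the reversed store order below mirror the intrinsic code above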
asm volatile(
|
|
"0: \n"
|
|
"pld [%1, #64] \n"
|
|
"vld1.u8 {d0}, [%1], %10 \n"
|
|
|
|
"pld [%2, #64] \n"
|
|
"vld1.u8 {d1}, [%2], %10 \n"
|
|
|
|
"pld [%1, #64] \n"
|
|
"vld1.u8 {d2}, [%1], %10 \n"
|
|
|
|
"vtrn.u8 d1, d0 \n" // _src01t_r
|
|
|
|
"pld [%2, #64] \n"
|
|
"vld1.u8 {d3}, [%2], %10 \n"
|
|
|
|
"pld [%1, #64] \n"
|
|
"vld1.u8 {d4}, [%1], %10 \n"
|
|
|
|
"vtrn.u8 d3, d2 \n" // _src23t_r
|
|
|
|
"pld [%2, #64] \n"
|
|
"vld1.u8 {d5}, [%2], %10 \n"
|
|
|
|
"pld [%1, #64] \n"
|
|
"vld1.u8 {d6}, [%1], %10 \n"
|
|
|
|
"vtrn.u8 d5, d4 \n" // _src45t_r
|
|
|
|
"pld [%2, #64] \n"
|
|
"vld1.u8 {d7}, [%2], %10 \n"
|
|
|
|
"vtrn.u8 d7, d6 \n" // _src67t_r
|
|
|
|
"sub %1, %1, %10, lsl #2 \n" // restore src0
|
|
|
|
"vtrn.u16 q1, q0 \n" // _src02tt_r _src13tt_r
|
|
|
|
"sub %2, %2, %10, lsl #2 \n" // restore src1
|
|
|
|
"vtrn.u16 q3, q2 \n" // _src46tt_r _src57tt_r
|
|
|
|
"add %1, #8 \n" // src0 += 8
|
|
|
|
"vtrn.u32 q3, q1 \n" // _src26ttt_r _src37ttt_r
|
|
|
|
"add %2, #8 \n" // src1 += 8
|
|
|
|
"vtrn.u32 q2, q0 \n" // _src04ttt_r _src15ttt_r
|
|
"vst1.u8 {d6}, [%4], %11 \n"
|
|
"vst1.u8 {d7}, [%3], %11 \n"
|
|
|
|
"subs %0, #1 \n"
|
|
|
|
"vst1.u8 {d4}, [%4], %11 \n"
|
|
"vst1.u8 {d5}, [%3], %11 \n"
|
|
"vst1.u8 {d2}, [%4], %11 \n"
|
|
"vst1.u8 {d3}, [%3], %11 \n"
|
|
"vst1.u8 {d0}, [%4], %11 \n"
|
|
"vst1.u8 {d1}, [%3], %11 \n"
|
|
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(src1), // %2
|
|
"=r"(dst0), // %3
|
|
"=r"(dst1) // %4
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(src1),
|
|
"3"(dst0),
|
|
"4"(dst1),
|
|
"r"(src_step), // %10
|
|
"r"(dst_step) // %11
|
|
: "cc", "memory", "q0", "q1", "q2", "q3");
|
|
}
|
|
#endif // __aarch64__
|
|
for (; remain > 0; remain--)
|
|
{
|
|
dst0[0] = src1[0 + 3 * src_step];
|
|
dst0[1] = src0[0 + 3 * src_step];
|
|
dst0[2] = src1[0 + 2 * src_step];
|
|
dst0[3] = src0[0 + 2 * src_step];
|
|
dst0[4] = src1[0 + src_step];
|
|
dst0[5] = src0[0 + src_step];
|
|
dst0[6] = src1[0];
|
|
dst0[7] = src0[0];
|
|
|
|
src0 += 1;
|
|
src1 += 1;
|
|
|
|
dst0 += stride;
|
|
}
|
|
|
|
src0 += srcwgap + 7 * srcstride;
|
|
}
|
|
#endif // __ARM_NEON
|
|
for (; y < srch; y++)
|
|
{
|
|
unsigned char* dst0 = dstend - y - 1;
|
|
|
|
int x = 0;
|
|
for (; x < srcw; x++)
|
|
{
|
|
*dst0 = *src0;
|
|
|
|
src0 += 1;
|
|
dst0 += stride;
|
|
}
|
|
|
|
src0 += srcwgap;
|
|
}
|
|
}
|
|
|
|
static void kanna_rotate_6_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw * 2;
|
|
|
|
// point to the last dst pixel in row
|
|
unsigned char* dstend = dst + w * 2;
|
|
|
|
const unsigned char* src0 = src;
|
|
|
|
int y = 0;
|
|
#if __ARM_NEON
|
|
for (; y + 7 < srch; y += 8)
|
|
{
|
|
const unsigned char* src1 = src0 + srcstride;
|
|
|
|
unsigned char* dst0 = dstend - y * 2 - 8 * 2;
|
|
unsigned char* dst1 = dstend - y * 2 - 8 * 2 + stride;
|
|
|
|
int src_step = 2 * srcstride;
|
|
int dst_step = 2 * stride;
|
|
|
|
int nn = srcw >> 3;
|
|
int remain = srcw - (nn << 3);
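        // two-channel variant: vld2/vst2 keep the two planes separate so the
        // same transpose ladder can run on each plane independently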
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x8x2_t _src0 = vld2_u8(src0);
|
|
uint8x8x2_t _src1 = vld2_u8(src1);
|
|
|
|
uint8x8x2_t _src2 = vld2_u8(src0 + src_step);
|
|
uint8x8x2_t _src3 = vld2_u8(src1 + src_step);
|
|
|
|
uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step);
|
|
uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step);
|
|
|
|
uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step);
|
|
uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step);
|
|
|
|
uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
|
|
uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
|
|
uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
|
|
uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);
|
|
|
|
uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
|
|
uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
|
|
uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
|
|
uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);
|
|
|
|
uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
|
|
uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
|
|
uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
|
|
uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
|
|
|
|
uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
|
|
uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
|
|
uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
|
|
uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));
|
|
|
|
uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
|
|
uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
|
|
uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
|
|
uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
|
|
|
|
uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
|
|
uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
|
|
uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
|
|
uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));
|
|
|
|
uint8x8x2_t _dst0;
|
|
uint8x8x2_t _dst1;
|
|
uint8x8x2_t _dst2;
|
|
uint8x8x2_t _dst3;
|
|
uint8x8x2_t _dst4;
|
|
uint8x8x2_t _dst5;
|
|
uint8x8x2_t _dst6;
|
|
uint8x8x2_t _dst7;
|
|
|
|
_dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
|
|
_dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
|
|
_dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
|
|
_dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
|
|
_dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
|
|
_dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
|
|
_dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
|
|
_dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
|
|
|
|
_dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
|
|
_dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
|
|
_dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
|
|
_dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
|
|
_dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
|
|
_dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
|
|
_dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
|
|
_dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
|
|
|
|
vst2_u8(dst0, _dst7);
|
|
vst2_u8(dst1, _dst6);
|
|
vst2_u8(dst0 + dst_step, _dst5);
|
|
vst2_u8(dst1 + dst_step, _dst4);
|
|
vst2_u8(dst0 + 2 * dst_step, _dst3);
|
|
vst2_u8(dst1 + 2 * dst_step, _dst2);
|
|
vst2_u8(dst0 + 3 * dst_step, _dst1);
|
|
vst2_u8(dst1 + 3 * dst_step, _dst0);
|
|
|
|
src0 += 2 * 8;
|
|
src1 += 2 * 8;
|
|
|
|
dst0 += 4 * dst_step;
|
|
dst1 += 4 * dst_step;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"0: \n"
|
|
"pld [%1, #128] \n"
|
|
"vld2.u8 {d0-d1}, [%1], %10 \n"
|
|
|
|
"pld [%2, #128] \n"
|
|
"vld2.u8 {d2-d3}, [%2], %10 \n"
|
|
|
|
"pld [%1, #128] \n"
|
|
"vld2.u8 {d4-d5}, [%1], %10 \n"
|
|
|
|
"vtrn.u8 q1, q0 \n" // _src01t_r
|
|
|
|
"pld [%2, #128] \n"
|
|
"vld2.u8 {d6-d7}, [%2], %10 \n"
|
|
|
|
"pld [%1, #128] \n"
|
|
"vld2.u8 {d16-d17}, [%1], %10\n"
|
|
|
|
"vtrn.u8 q3, q2 \n" // _src23t_r
|
|
|
|
"pld [%2, #128] \n"
|
|
"vld2.u8 {d18-d19}, [%2], %10\n"
|
|
|
|
"pld [%1, #128] \n"
|
|
"vld2.u8 {d20-d21}, [%1], %10\n"
|
|
|
|
"vtrn.u8 q9, q8 \n" // _src45t_r
|
|
|
|
"pld [%2, #128] \n"
|
|
"vld2.u8 {d22-d23}, [%2], %10\n"
|
|
|
|
"vtrn.u8 q11, q10 \n" // _src67t_r
|
|
|
|
"sub %1, %1, %10, lsl #2 \n" // restore src0
|
|
|
|
"vtrn.u16 q2, q0 \n" // _src02tt_r
|
|
|
|
"sub %2, %2, %10, lsl #2 \n" // restore src1
|
|
|
|
"vtrn.u16 q3, q1 \n" // _src13tt_r
|
|
|
|
"add %1, #16 \n" // src0 += 16
|
|
|
|
"vtrn.u16 q10, q8 \n" // _src46tt_r
|
|
|
|
"add %2, #16 \n" // src1 += 16
|
|
|
|
"vtrn.u16 q11, q9 \n" // _src57tt_r
|
|
|
|
"vtrn.u32 q10, q2 \n" // _src26ttt_r
|
|
|
|
"vtrn.u32 q11, q3 \n" // _src37ttt_r
|
|
"vst2.u8 {d20-d21}, [%4], %11\n"
|
|
|
|
"vtrn.u32 q8, q0 \n" // _src04ttt_r
|
|
"vst2.u8 {d22-d23}, [%3], %11\n"
|
|
|
|
"vtrn.u32 q9, q1 \n" // _src15ttt_r
|
|
"vst2.u8 {d16-d17}, [%4], %11\n"
|
|
|
|
"subs %0, #1 \n"
|
|
|
|
"vst2.u8 {d18-d19}, [%3], %11\n"
|
|
"vst2.u8 {d4-d5}, [%4], %11 \n"
|
|
"vst2.u8 {d6-d7}, [%3], %11 \n"
|
|
"vst2.u8 {d0-d1}, [%4], %11 \n"
|
|
"vst2.u8 {d2-d3}, [%3], %11 \n"
|
|
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(src1), // %2
|
|
"=r"(dst0), // %3
|
|
"=r"(dst1) // %4
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(src1),
|
|
"3"(dst0),
|
|
"4"(dst1),
|
|
"r"(src_step), // %10
|
|
"r"(dst_step) // %11
|
|
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
|
|
}
|
|
#endif // __aarch64__
|
|
for (; remain > 0; remain--)
|
|
{
|
|
dst0[0] = src1[0 + 3 * src_step];
|
|
dst0[1] = src1[1 + 3 * src_step];
|
|
dst0[2] = src0[0 + 3 * src_step];
|
|
dst0[3] = src0[1 + 3 * src_step];
|
|
dst0[4] = src1[0 + 2 * src_step];
|
|
dst0[5] = src1[1 + 2 * src_step];
|
|
dst0[6] = src0[0 + 2 * src_step];
|
|
dst0[7] = src0[1 + 2 * src_step];
|
|
dst0[8] = src1[0 + src_step];
|
|
dst0[9] = src1[1 + src_step];
|
|
dst0[10] = src0[0 + src_step];
|
|
dst0[11] = src0[1 + src_step];
|
|
dst0[12] = src1[0];
|
|
dst0[13] = src1[1];
|
|
dst0[14] = src0[0];
|
|
dst0[15] = src0[1];
|
|
|
|
src0 += 2;
|
|
src1 += 2;
|
|
|
|
dst0 += stride;
|
|
}
|
|
|
|
src0 += srcwgap + 7 * srcstride;
|
|
}
|
|
#endif // __ARM_NEON
|
|
for (; y < srch; y++)
|
|
{
|
|
unsigned char* dst0 = dstend - y * 2 - 2;
|
|
|
|
int x = 0;
|
|
for (; x < srcw; x++)
|
|
{
|
|
dst0[0] = src0[0];
|
|
dst0[1] = src0[1];
|
|
|
|
src0 += 2;
|
|
dst0 += stride;
|
|
}
|
|
|
|
src0 += srcwgap;
|
|
}
|
|
}
|
|
|
|
static void kanna_rotate_6_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw * 3;
|
|
|
|
// point to the last dst pixel in row
|
|
unsigned char* dstend = dst + w * 3;
|
|
|
|
const unsigned char* src0 = src;
|
|
|
|
int y = 0;
|
|
#if __ARM_NEON
|
|
for (; y + 7 < srch; y += 8)
|
|
{
|
|
const unsigned char* src1 = src0 + srcstride;
|
|
|
|
unsigned char* dst0 = dstend - y * 3 - 8 * 3;
|
|
unsigned char* dst1 = dstend - y * 3 - 8 * 3 + stride;
|
|
|
|
int src_step = 2 * srcstride;
|
|
int dst_step = 2 * stride;
|
|
|
|
int nn = srcw >> 3;
|
|
int remain = srcw - (nn << 3);
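        // three-channel variant: vld3/vst3 de-interleave r, g and b, each
        // plane is transposed independently and re-interleaved by the store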
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x8x3_t _src0 = vld3_u8(src0);
|
|
uint8x8x3_t _src1 = vld3_u8(src1);
|
|
|
|
uint8x8x3_t _src2 = vld3_u8(src0 + src_step);
|
|
uint8x8x3_t _src3 = vld3_u8(src1 + src_step);
|
|
|
|
uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step);
|
|
uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step);
|
|
|
|
uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step);
|
|
uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step);
|
|
|
|
uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
|
|
uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
|
|
uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
|
|
uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);
|
|
|
|
uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
|
|
uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
|
|
uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
|
|
uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);
|
|
|
|
uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]);
|
|
uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]);
|
|
uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]);
|
|
uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]);
|
|
|
|
uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
|
|
uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
|
|
uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
|
|
uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
|
|
|
|
uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
|
|
uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
|
|
uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
|
|
uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));
|
|
|
|
uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1]));
|
|
uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0]));
|
|
uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1]));
|
|
uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0]));
|
|
|
|
uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
|
|
uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
|
|
uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
|
|
uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
|
|
|
|
uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
|
|
uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
|
|
uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
|
|
uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));
|
|
|
|
uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1]));
|
|
uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1]));
|
|
uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0]));
|
|
uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0]));
|
|
|
|
uint8x8x3_t _dst0;
|
|
uint8x8x3_t _dst1;
|
|
uint8x8x3_t _dst2;
|
|
uint8x8x3_t _dst3;
|
|
uint8x8x3_t _dst4;
|
|
uint8x8x3_t _dst5;
|
|
uint8x8x3_t _dst6;
|
|
uint8x8x3_t _dst7;
|
|
|
|
_dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
|
|
_dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
|
|
_dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
|
|
_dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
|
|
_dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
|
|
_dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
|
|
_dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
|
|
_dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
|
|
|
|
_dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
|
|
_dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
|
|
_dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
|
|
_dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
|
|
_dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
|
|
_dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
|
|
_dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
|
|
_dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
|
|
|
|
_dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
|
|
_dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
|
|
_dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
|
|
_dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
|
|
_dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
|
|
_dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
|
|
_dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
|
|
_dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
|
|
|
|
vst3_u8(dst0, _dst7);
|
|
vst3_u8(dst1, _dst6);
|
|
vst3_u8(dst0 + dst_step, _dst5);
|
|
vst3_u8(dst1 + dst_step, _dst4);
|
|
vst3_u8(dst0 + 2 * dst_step, _dst3);
|
|
vst3_u8(dst1 + 2 * dst_step, _dst2);
|
|
vst3_u8(dst0 + 3 * dst_step, _dst1);
|
|
vst3_u8(dst1 + 3 * dst_step, _dst0);
|
|
|
|
src0 += 3 * 8;
|
|
src1 += 3 * 8;
|
|
|
|
dst0 += 4 * dst_step;
|
|
dst1 += 4 * dst_step;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"0: \n"
|
|
"pld [%1, #192] \n"
|
|
"vld3.u8 {d0-d2}, [%1], %10 \n"
|
|
|
|
"pld [%2, #192] \n"
|
|
"vld3.u8 {d4-d6}, [%2], %10 \n"
|
|
|
|
"pld [%1, #192] \n"
|
|
"vld3.u8 {d8-d10}, [%1], %10 \n"
|
|
|
|
"vtrn.u8 q2, q0 \n" // _src01t_r
|
|
"vtrn.u8 d6, d2 \n"
|
|
|
|
"pld [%2, #192] \n"
|
|
"vld3.u8 {d12-d14}, [%2], %10\n"
|
|
|
|
"pld [%1, #192] \n"
|
|
"vld3.u8 {d16-d18}, [%1], %10\n"
|
|
|
|
"vtrn.u8 q6, q4 \n" // _src23t_r
|
|
"vtrn.u8 d14, d10 \n"
|
|
|
|
"pld [%2, #192] \n"
|
|
"vld3.u8 {d20-d22}, [%2], %10\n"
|
|
|
|
"pld [%1, #192] \n"
|
|
"vld3.u8 {d24-d26}, [%1], %10\n"
|
|
|
|
"vtrn.u8 q10, q8 \n" // _src45t_r
|
|
"vtrn.u8 d22, d18 \n"
|
|
|
|
"pld [%2, #192] \n"
|
|
"vld3.u8 {d28-d30}, [%2], %10\n"
|
|
|
|
"vtrn.u8 q14, q12 \n" // _src67t_r
|
|
"vtrn.u8 d30, d26 \n"
|
|
|
|
"sub %1, %1, %10, lsl #2 \n" // restore src0
|
|
|
|
"vtrn.u16 q4, q0 \n" // _src02tt_r
|
|
"vtrn.u16 d10, d2 \n"
|
|
|
|
"sub %2, %2, %10, lsl #2 \n" // restore src1
|
|
|
|
"vtrn.u16 q6, q2 \n" // _src13tt_r
|
|
"vtrn.u16 d14, d6 \n"
|
|
|
|
"add %1, #24 \n" // src0 += 24
|
|
|
|
"vtrn.u16 q12, q8 \n" // _src46tt_r
|
|
"vtrn.u16 d26, d18 \n"
|
|
|
|
"add %2, #24 \n" // src1 += 24
|
|
|
|
"vtrn.u16 q14, q10 \n" // _src57tt_r
|
|
"vtrn.u16 d30, d22 \n"
|
|
|
|
"vtrn.u32 q12, q4 \n" // _src26ttt_r
|
|
"vtrn.u32 d26, d10 \n"
|
|
|
|
"vtrn.u32 q14, q6 \n" // _src37ttt_r
|
|
"vst3.u8 {d24-d26}, [%4], %11\n"
|
|
"vtrn.u32 d30, d14 \n"
|
|
|
|
"vtrn.u32 q8, q0 \n" // _src04ttt_r
|
|
"vst3.u8 {d28-d30}, [%3], %11\n"
|
|
"vtrn.u32 d18, d2 \n"
|
|
|
|
"vtrn.u32 q10, q2 \n" // _src15ttt_r
|
|
"vst3.u8 {d16-d18}, [%4], %11\n"
|
|
"vtrn.u32 d22, d6 \n"
|
|
|
|
"subs %0, #1 \n"
|
|
|
|
"vst3.u8 {d20-d22}, [%3], %11\n"
|
|
"vst3.u8 {d8-d10}, [%4], %11 \n"
|
|
"vst3.u8 {d12-d14}, [%3], %11\n"
|
|
"vst3.u8 {d0-d2}, [%4], %11 \n"
|
|
"vst3.u8 {d4-d6}, [%3], %11 \n"
|
|
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(src1), // %2
|
|
"=r"(dst0), // %3
|
|
"=r"(dst1) // %4
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(src1),
|
|
"3"(dst0),
|
|
"4"(dst1),
|
|
"r"(src_step), // %10
|
|
"r"(dst_step) // %11
|
|
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
|
|
}
|
|
#endif // __aarch64__
|
|
for (; remain > 0; remain--)
|
|
{
|
|
dst0[0] = src1[0 + 3 * src_step];
|
|
dst0[1] = src1[1 + 3 * src_step];
|
|
dst0[2] = src1[2 + 3 * src_step];
|
|
dst0[3] = src0[0 + 3 * src_step];
|
|
dst0[4] = src0[1 + 3 * src_step];
|
|
dst0[5] = src0[2 + 3 * src_step];
|
|
dst0[6] = src1[0 + 2 * src_step];
|
|
dst0[7] = src1[1 + 2 * src_step];
|
|
dst0[8] = src1[2 + 2 * src_step];
|
|
dst0[9] = src0[0 + 2 * src_step];
|
|
dst0[10] = src0[1 + 2 * src_step];
|
|
dst0[11] = src0[2 + 2 * src_step];
|
|
dst0[12] = src1[0 + src_step];
|
|
dst0[13] = src1[1 + src_step];
|
|
dst0[14] = src1[2 + src_step];
|
|
dst0[15] = src0[0 + src_step];
|
|
dst0[16] = src0[1 + src_step];
|
|
dst0[17] = src0[2 + src_step];
|
|
dst0[18] = src1[0];
|
|
dst0[19] = src1[1];
|
|
dst0[20] = src1[2];
|
|
dst0[21] = src0[0];
|
|
dst0[22] = src0[1];
|
|
dst0[23] = src0[2];
|
|
|
|
src0 += 3;
|
|
src1 += 3;
|
|
|
|
dst0 += stride;
|
|
}
|
|
|
|
src0 += srcwgap + 7 * srcstride;
|
|
}
|
|
#endif // __ARM_NEON
|
|
for (; y < srch; y++)
|
|
{
|
|
unsigned char* dst0 = dstend - y * 3 - 3;
|
|
|
|
int x = 0;
|
|
for (; x < srcw; x++)
|
|
{
|
|
dst0[0] = src0[0];
|
|
dst0[1] = src0[1];
|
|
dst0[2] = src0[2];
|
|
|
|
src0 += 3;
|
|
dst0 += stride;
|
|
}
|
|
|
|
src0 += srcwgap;
|
|
}
|
|
}
|
|
|
|
static void kanna_rotate_6_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw * 4;
|
|
|
|
// point to the last dst pixel in row
|
|
unsigned char* dstend = dst + w * 4;
|
|
|
|
const unsigned char* src0 = src;
|
|
|
|
int y = 0;
|
|
#if __ARM_NEON
|
|
for (; y + 7 < srch; y += 8)
|
|
{
|
|
const unsigned char* src1 = src0 + srcstride;
|
|
|
|
unsigned char* dst0 = dstend - y * 4 - 8 * 4;
|
|
unsigned char* dst1 = dstend - y * 4 - 8 * 4 + stride;
|
|
|
|
int src_step = 2 * srcstride;
|
|
int dst_step = 2 * stride;
|
|
|
|
int nn = srcw >> 3;
|
|
int remain = srcw - (nn << 3);
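        // four-channel variant: the same ladder applied to the r, g, b and a
        // planes loaded by vld4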
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x8x4_t _src0 = vld4_u8(src0);
|
|
uint8x8x4_t _src1 = vld4_u8(src1);
|
|
|
|
uint8x8x4_t _src2 = vld4_u8(src0 + src_step);
|
|
uint8x8x4_t _src3 = vld4_u8(src1 + src_step);
|
|
|
|
uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step);
|
|
uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step);
|
|
|
|
uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step);
|
|
uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step);
|
|
|
|
uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
|
|
uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
|
|
uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
|
|
uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);
|
|
|
|
uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
|
|
uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
|
|
uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
|
|
uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);
|
|
|
|
uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]);
|
|
uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]);
|
|
uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]);
|
|
uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]);
|
|
|
|
uint8x8x2_t _src01t_a = vtrn_u8(_src1.val[3], _src0.val[3]);
|
|
uint8x8x2_t _src23t_a = vtrn_u8(_src3.val[3], _src2.val[3]);
|
|
uint8x8x2_t _src45t_a = vtrn_u8(_src5.val[3], _src4.val[3]);
|
|
uint8x8x2_t _src67t_a = vtrn_u8(_src7.val[3], _src6.val[3]);
|
|
|
|
uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
|
|
uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
|
|
uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
|
|
uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
|
|
|
|
uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
|
|
uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
|
|
uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
|
|
uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));
|
|
|
|
uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1]));
|
|
uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0]));
|
|
uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1]));
|
|
uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0]));
|
|
|
|
uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[1]), vreinterpret_u16_u8(_src01t_a.val[1]));
|
|
uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[0]), vreinterpret_u16_u8(_src01t_a.val[0]));
|
|
uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[1]), vreinterpret_u16_u8(_src45t_a.val[1]));
|
|
uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[0]), vreinterpret_u16_u8(_src45t_a.val[0]));
|
|
|
|
uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
|
|
uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
|
|
uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
|
|
uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
|
|
|
|
uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
|
|
uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
|
|
uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
|
|
uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));
|
|
|
|
uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1]));
|
|
uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1]));
|
|
uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0]));
|
|
uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0]));
|
|
|
|
uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[1]), vreinterpret_u32_u16(_src02tt_a.val[1]));
|
|
uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[1]), vreinterpret_u32_u16(_src13tt_a.val[1]));
|
|
uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[0]), vreinterpret_u32_u16(_src02tt_a.val[0]));
|
|
uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[0]), vreinterpret_u32_u16(_src13tt_a.val[0]));
|
|
|
|
uint8x8x4_t _dst0;
|
|
uint8x8x4_t _dst1;
|
|
uint8x8x4_t _dst2;
|
|
uint8x8x4_t _dst3;
|
|
uint8x8x4_t _dst4;
|
|
uint8x8x4_t _dst5;
|
|
uint8x8x4_t _dst6;
|
|
uint8x8x4_t _dst7;
|
|
|
|
_dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
|
|
_dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
|
|
_dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
|
|
_dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
|
|
_dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
|
|
_dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
|
|
_dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
|
|
_dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
|
|
|
|
_dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
|
|
_dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
|
|
_dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
|
|
_dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
|
|
_dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
|
|
_dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
|
|
_dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
|
|
_dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
|
|
|
|
_dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
|
|
_dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
|
|
_dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
|
|
_dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
|
|
_dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
|
|
_dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
|
|
_dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
|
|
_dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
|
|
|
|
_dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]);
|
|
_dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]);
|
|
_dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]);
|
|
_dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]);
|
|
_dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]);
|
|
_dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]);
|
|
_dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]);
|
|
_dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]);
|
|
|
|
vst4_u8(dst0, _dst7);
|
|
vst4_u8(dst1, _dst6);
|
|
vst4_u8(dst0 + dst_step, _dst5);
|
|
vst4_u8(dst1 + dst_step, _dst4);
|
|
vst4_u8(dst0 + 2 * dst_step, _dst3);
|
|
vst4_u8(dst1 + 2 * dst_step, _dst2);
|
|
vst4_u8(dst0 + 3 * dst_step, _dst1);
|
|
vst4_u8(dst1 + 3 * dst_step, _dst0);
|
|
|
|
src0 += 4 * 8;
|
|
src1 += 4 * 8;
|
|
|
|
dst0 += 4 * dst_step;
|
|
dst1 += 4 * dst_step;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"0: \n"
|
|
"pld [%1, #256] \n"
|
|
"vld4.u8 {d0-d3}, [%1], %10 \n"
|
|
|
|
"pld [%2, #256] \n"
|
|
"vld4.u8 {d4-d7}, [%2], %10 \n"
|
|
|
|
"pld [%1, #256] \n"
|
|
"vld4.u8 {d8-d11}, [%1], %10 \n"
|
|
|
|
"vtrn.u8 q2, q0 \n" // _src01t_r
|
|
"vtrn.u8 q3, q1 \n"
|
|
|
|
"pld [%2, #256] \n"
|
|
"vld4.u8 {d12-d15}, [%2], %10\n"
|
|
|
|
"pld [%1, #256] \n"
|
|
"vld4.u8 {d16-d19}, [%1], %10\n"
|
|
|
|
"vtrn.u8 q6, q4 \n" // _src23t_r
|
|
"vtrn.u8 q7, q5 \n"
|
|
|
|
"pld [%2, #256] \n"
|
|
"vld4.u8 {d20-d23}, [%2], %10\n"
|
|
|
|
"pld [%1, #256] \n"
|
|
"vld4.u8 {d24-d27}, [%1], %10\n"
|
|
|
|
"vtrn.u8 q10, q8 \n" // _src45t_r
|
|
"vtrn.u8 q11, q9 \n"
|
|
|
|
"pld [%2, #256] \n"
|
|
"vld4.u8 {d28-d31}, [%2], %10\n"
|
|
|
|
"vtrn.u8 q14, q12 \n" // _src67t_r
|
|
"vtrn.u8 q15, q13 \n"
|
|
|
|
"sub %1, %1, %10, lsl #2 \n" // restore src0
|
|
|
|
"vtrn.u16 q4, q0 \n" // _src02tt_r
|
|
"vtrn.u16 q5, q1 \n"
|
|
|
|
"sub %2, %2, %10, lsl #2 \n" // restore src1
|
|
|
|
"vtrn.u16 q6, q2 \n" // _src13tt_r
|
|
"vtrn.u16 q7, q3 \n"
|
|
|
|
"add %1, #32 \n" // src0 += 32
|
|
|
|
"vtrn.u16 q12, q8 \n" // _src46tt_r
|
|
"vtrn.u16 q13, q9 \n"
|
|
|
|
"add %2, #32 \n" // src1 += 32
|
|
|
|
"vtrn.u16 q14, q10 \n" // _src57tt_r
|
|
"vtrn.u16 q15, q11 \n"
|
|
|
|
"vtrn.u32 q12, q4 \n" // _src26ttt_r
|
|
"vtrn.u32 q13, q5 \n"
|
|
|
|
"vtrn.u32 q14, q6 \n" // _src37ttt_r
|
|
"vst4.u8 {d24-d27}, [%4], %11\n"
|
|
"vtrn.u32 q15, q7 \n"
|
|
|
|
"vtrn.u32 q8, q0 \n" // _src04ttt_r
|
|
"vst4.u8 {d28-d31}, [%3], %11\n"
|
|
"vtrn.u32 q9, q1 \n"
|
|
|
|
"vtrn.u32 q10, q2 \n" // _src15ttt_r
|
|
"vst4.u8 {d16-d19}, [%4], %11\n"
|
|
"vtrn.u32 q11, q3 \n"
|
|
|
|
"subs %0, #1 \n"
|
|
|
|
"vst4.u8 {d8-d11}, [%4], %11 \n"
|
|
"vst4.u8 {d20-d23}, [%3], %11\n"
|
|
"vst4.u8 {d12-d15}, [%3], %11\n"
|
|
"vst4.u8 {d0-d3}, [%4], %11 \n"
|
|
"vst4.u8 {d4-d7}, [%3], %11 \n"
|
|
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(src1), // %2
|
|
"=r"(dst0), // %3
|
|
"=r"(dst1) // %4
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(src1),
|
|
"3"(dst0),
|
|
"4"(dst1),
|
|
"r"(src_step), // %10
|
|
"r"(dst_step) // %11
|
|
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
|
|
}
|
|
#endif // __aarch64__
|
|
for (; remain > 0; remain--)
|
|
{
|
|
dst0[0] = src1[0 + 3 * src_step];
|
|
dst0[1] = src1[1 + 3 * src_step];
|
|
dst0[2] = src1[2 + 3 * src_step];
|
|
dst0[3] = src1[3 + 3 * src_step];
|
|
dst0[4] = src0[0 + 3 * src_step];
|
|
dst0[5] = src0[1 + 3 * src_step];
|
|
dst0[6] = src0[2 + 3 * src_step];
|
|
dst0[7] = src0[3 + 3 * src_step];
|
|
dst0[8] = src1[0 + 2 * src_step];
|
|
dst0[9] = src1[1 + 2 * src_step];
|
|
dst0[10] = src1[2 + 2 * src_step];
|
|
dst0[11] = src1[3 + 2 * src_step];
|
|
dst0[12] = src0[0 + 2 * src_step];
|
|
dst0[13] = src0[1 + 2 * src_step];
|
|
dst0[14] = src0[2 + 2 * src_step];
|
|
dst0[15] = src0[3 + 2 * src_step];
|
|
dst0[16] = src1[0 + src_step];
|
|
dst0[17] = src1[1 + src_step];
|
|
dst0[18] = src1[2 + src_step];
|
|
dst0[19] = src1[3 + src_step];
|
|
dst0[20] = src0[0 + src_step];
|
|
dst0[21] = src0[1 + src_step];
|
|
dst0[22] = src0[2 + src_step];
|
|
dst0[23] = src0[3 + src_step];
|
|
dst0[24] = src1[0];
|
|
dst0[25] = src1[1];
|
|
dst0[26] = src1[2];
|
|
dst0[27] = src1[3];
|
|
dst0[28] = src0[0];
|
|
dst0[29] = src0[1];
|
|
dst0[30] = src0[2];
|
|
dst0[31] = src0[3];
|
|
|
|
src0 += 4;
|
|
src1 += 4;
|
|
|
|
dst0 += stride;
|
|
}
|
|
|
|
src0 += srcwgap + 7 * srcstride;
|
|
}
|
|
#endif // __ARM_NEON
|
|
for (; y < srch; y++)
|
|
{
|
|
unsigned char* dst0 = dstend - y * 4 - 4;
|
|
|
|
int x = 0;
|
|
for (; x < srcw; x++)
|
|
{
|
|
dst0[0] = src0[0];
|
|
dst0[1] = src0[1];
|
|
dst0[2] = src0[2];
|
|
dst0[3] = src0[3];
|
|
|
|
src0 += 4;
|
|
dst0 += stride;
|
|
}
|
|
|
|
src0 += srcwgap;
|
|
}
|
|
}
|
|
|
|
static void kanna_rotate_7_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw;
|
|
|
|
// point to the last dst pixel
|
|
unsigned char* dstend = dst + stride * (h - 1) + w;
|
|
|
|
const unsigned char* src0 = src;
|
|
|
|
int y = 0;
|
|
#if __ARM_NEON
|
|
for (; y + 7 < srch; y += 8)
|
|
{
|
|
const unsigned char* src1 = src0 + srcstride;
|
|
|
|
unsigned char* dst6 = dstend - y - 8 - stride;
|
|
unsigned char* dst7 = dstend - y - 8;
|
|
|
|
int src_step = 2 * srcstride;
|
|
int dst_step = -2 * stride;
|
|
|
|
int nn = srcw >> 3;
|
|
int remain = srcw - (nn << 3);
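        // like rotate_6_c1, but dst_step is negative (-2 * stride), so the
        // transposed block is written upwards from the bottom destination row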
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x8_t _src0 = vld1_u8(src0);
|
|
uint8x8_t _src1 = vld1_u8(src1);
|
|
|
|
uint8x8_t _src2 = vld1_u8(src0 + src_step);
|
|
uint8x8_t _src3 = vld1_u8(src1 + src_step);
|
|
|
|
uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step);
|
|
uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step);
|
|
|
|
uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step);
|
|
uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step);
|
|
|
|
uint8x8x2_t _src01t_r = vtrn_u8(_src1, _src0);
|
|
uint8x8x2_t _src23t_r = vtrn_u8(_src3, _src2);
|
|
uint8x8x2_t _src45t_r = vtrn_u8(_src5, _src4);
|
|
uint8x8x2_t _src67t_r = vtrn_u8(_src7, _src6);
|
|
|
|
uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
|
|
uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
|
|
uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
|
|
uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
|
|
|
|
uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
|
|
uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
|
|
uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
|
|
uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
|
|
|
|
uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[1]);
|
|
uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[1]);
|
|
uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[1]);
|
|
uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[1]);
|
|
uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[0]);
|
|
uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[0]);
|
|
uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[0]);
|
|
uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[0]);
|
|
|
|
vst1_u8(dst7, _dst7);
|
|
vst1_u8(dst6, _dst6);
|
|
vst1_u8(dst7 + dst_step, _dst5);
|
|
vst1_u8(dst6 + dst_step, _dst4);
|
|
vst1_u8(dst7 + 2 * dst_step, _dst3);
|
|
vst1_u8(dst6 + 2 * dst_step, _dst2);
|
|
vst1_u8(dst7 + 3 * dst_step, _dst1);
|
|
vst1_u8(dst6 + 3 * dst_step, _dst0);
|
|
|
|
src0 += 8;
|
|
src1 += 8;
|
|
|
|
dst7 += 4 * dst_step;
|
|
dst6 += 4 * dst_step;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"0: \n"
|
|
"pld [%1, #64] \n"
|
|
"vld1.u8 {d0}, [%1], %10 \n"
|
|
|
|
"pld [%2, #64] \n"
|
|
"vld1.u8 {d1}, [%2], %10 \n"
|
|
|
|
"pld [%1, #64] \n"
|
|
"vld1.u8 {d2}, [%1], %10 \n"
|
|
|
|
"vtrn.u8 d1, d0 \n" // _src01t_r
|
|
|
|
"pld [%2, #64] \n"
|
|
"vld1.u8 {d3}, [%2], %10 \n"
|
|
|
|
"pld [%1, #64] \n"
|
|
"vld1.u8 {d4}, [%1], %10 \n"
|
|
|
|
"vtrn.u8 d3, d2 \n" // _src23t_r
|
|
|
|
"pld [%2, #64] \n"
|
|
"vld1.u8 {d5}, [%2], %10 \n"
|
|
|
|
"pld [%1, #64] \n"
|
|
"vld1.u8 {d6}, [%1], %10 \n"
|
|
|
|
"vtrn.u8 d5, d4 \n" // _src45t_r
|
|
|
|
"pld [%2, #64] \n"
|
|
"vld1.u8 {d7}, [%2], %10 \n"
|
|
|
|
"vtrn.u8 d7, d6 \n" // _src67t_r
|
|
|
|
"sub %1, %1, %10, lsl #2 \n" // restore src0
|
|
|
|
"vtrn.u16 q1, q0 \n" // _src02tt_r _src13tt_r
|
|
|
|
"sub %2, %2, %10, lsl #2 \n" // restore src1
|
|
|
|
"vtrn.u16 q3, q2 \n" // _src46tt_r _src57tt_r
|
|
|
|
"add %1, #8 \n" // src0 += 8
|
|
|
|
"vtrn.u32 q3, q1 \n" // _src26ttt_r _src37ttt_r
|
|
|
|
"add %2, #8 \n" // src1 += 8
|
|
|
|
"vtrn.u32 q2, q0 \n" // _src04ttt_r _src15ttt_r
|
|
"vst1.u8 {d6}, [%4], %11 \n"
|
|
"vst1.u8 {d7}, [%3], %11 \n"
|
|
|
|
"subs %0, #1 \n"
|
|
|
|
"vst1.u8 {d4}, [%4], %11 \n"
|
|
"vst1.u8 {d5}, [%3], %11 \n"
|
|
"vst1.u8 {d2}, [%4], %11 \n"
|
|
"vst1.u8 {d3}, [%3], %11 \n"
|
|
"vst1.u8 {d0}, [%4], %11 \n"
|
|
"vst1.u8 {d1}, [%3], %11 \n"
|
|
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(src1), // %2
|
|
"=r"(dst7), // %3
|
|
"=r"(dst6) // %4
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(src1),
|
|
"3"(dst7),
|
|
"4"(dst6),
|
|
"r"(src_step), // %10
|
|
"r"(dst_step) // %11
|
|
: "cc", "memory", "q0", "q1", "q2", "q3");
|
|
}
|
|
#endif // __aarch64__
|
|
for (; remain > 0; remain--)
|
|
{
|
|
dst7[0] = src1[0 + 3 * src_step];
|
|
dst7[1] = src0[0 + 3 * src_step];
|
|
dst7[2] = src1[0 + 2 * src_step];
|
|
dst7[3] = src0[0 + 2 * src_step];
|
|
dst7[4] = src1[0 + src_step];
|
|
dst7[5] = src0[0 + src_step];
|
|
dst7[6] = src1[0];
|
|
dst7[7] = src0[0];
|
|
|
|
src0 += 1;
|
|
src1 += 1;
|
|
|
|
dst7 -= stride;
|
|
}
|
|
|
|
src0 += srcwgap + 7 * srcstride;
|
|
}
|
|
#endif // __ARM_NEON
|
|
for (; y < srch; y++)
|
|
{
|
|
unsigned char* dst0 = dstend - y - 1;
|
|
|
|
int x = 0;
|
|
for (; x < srcw; x++)
|
|
{
|
|
*dst0 = *src0;
|
|
|
|
src0 += 1;
|
|
dst0 -= stride;
|
|
}
|
|
|
|
src0 += srcwgap;
|
|
}
|
|
}
|
|
|
|
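// kanna_rotate_7_c2 repeats the c1 kernel above for 2-channel pixels: vld2_u8/vst2_u8
// de-interleave the two channels, each channel plane goes through the same
// vtrn_u8/u16/u32 transpose, and the interleaved order is restored on store.
// The c3 and c4 variants below do the same with vld3/vst3 and vld4/vst4.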
static void kanna_rotate_7_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw * 2;
|
|
|
|
// point to the last dst pixel
|
|
unsigned char* dstend = dst + stride * (h - 1) + w * 2;
|
|
|
|
const unsigned char* src0 = src;
|
|
|
|
int y = 0;
|
|
#if __ARM_NEON
|
|
for (; y + 7 < srch; y += 8)
|
|
{
|
|
const unsigned char* src1 = src0 + srcstride;
|
|
|
|
unsigned char* dst6 = dstend - y * 2 - 8 * 2 - stride;
|
|
unsigned char* dst7 = dstend - y * 2 - 8 * 2;
|
|
|
|
int src_step = 2 * srcstride;
|
|
int dst_step = -2 * stride;
|
|
|
|
int nn = srcw >> 3;
|
|
int remain = srcw - (nn << 3);
|
|
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x8x2_t _src0 = vld2_u8(src0);
|
|
uint8x8x2_t _src1 = vld2_u8(src1);
|
|
|
|
uint8x8x2_t _src2 = vld2_u8(src0 + src_step);
|
|
uint8x8x2_t _src3 = vld2_u8(src1 + src_step);
|
|
|
|
uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step);
|
|
uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step);
|
|
|
|
uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step);
|
|
uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step);
|
|
|
|
uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
|
|
uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
|
|
uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
|
|
uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);
|
|
|
|
uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
|
|
uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
|
|
uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
|
|
uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);
|
|
|
|
uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
|
|
uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
|
|
uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
|
|
uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
|
|
|
|
uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
|
|
uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
|
|
uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
|
|
uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));
|
|
|
|
uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
|
|
uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
|
|
uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
|
|
uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
|
|
|
|
uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
|
|
uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
|
|
uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
|
|
uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));
|
|
|
|
uint8x8x2_t _dst0;
|
|
uint8x8x2_t _dst1;
|
|
uint8x8x2_t _dst2;
|
|
uint8x8x2_t _dst3;
|
|
uint8x8x2_t _dst4;
|
|
uint8x8x2_t _dst5;
|
|
uint8x8x2_t _dst6;
|
|
uint8x8x2_t _dst7;
|
|
|
|
_dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
|
|
_dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
|
|
_dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
|
|
_dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
|
|
_dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
|
|
_dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
|
|
_dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
|
|
_dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
|
|
|
|
_dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
|
|
_dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
|
|
_dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
|
|
_dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
|
|
_dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
|
|
_dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
|
|
_dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
|
|
_dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
|
|
|
|
vst2_u8(dst7, _dst7);
|
|
vst2_u8(dst6, _dst6);
|
|
vst2_u8(dst7 + dst_step, _dst5);
|
|
vst2_u8(dst6 + dst_step, _dst4);
|
|
vst2_u8(dst7 + 2 * dst_step, _dst3);
|
|
vst2_u8(dst6 + 2 * dst_step, _dst2);
|
|
vst2_u8(dst7 + 3 * dst_step, _dst1);
|
|
vst2_u8(dst6 + 3 * dst_step, _dst0);
|
|
|
|
src0 += 2 * 8;
|
|
src1 += 2 * 8;
|
|
|
|
dst7 += 4 * dst_step;
|
|
dst6 += 4 * dst_step;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"0: \n"
|
|
"pld [%1, #128] \n"
|
|
"vld2.u8 {d0-d1}, [%1], %10 \n"
|
|
|
|
"pld [%2, #128] \n"
|
|
"vld2.u8 {d2-d3}, [%2], %10 \n"
|
|
|
|
"pld [%1, #128] \n"
|
|
"vld2.u8 {d4-d5}, [%1], %10 \n"
|
|
|
|
"vtrn.u8 q1, q0 \n" // _src01t_r
|
|
|
|
"pld [%2, #128] \n"
|
|
"vld2.u8 {d6-d7}, [%2], %10 \n"
|
|
|
|
"pld [%1, #128] \n"
|
|
"vld2.u8 {d16-d17}, [%1], %10\n"
|
|
|
|
"vtrn.u8 q3, q2 \n" // _src23t_r
|
|
|
|
"pld [%2, #128] \n"
|
|
"vld2.u8 {d18-d19}, [%2], %10\n"
|
|
|
|
"pld [%1, #128] \n"
|
|
"vld2.u8 {d20-d21}, [%1], %10\n"
|
|
|
|
"vtrn.u8 q9, q8 \n" // _src45t_r
|
|
|
|
"pld [%2, #128] \n"
|
|
"vld2.u8 {d22-d23}, [%2], %10\n"
|
|
|
|
"vtrn.u8 q11, q10 \n" // _src67t_r
|
|
|
|
"sub %1, %1, %10, lsl #2 \n" // restore src0
|
|
|
|
"vtrn.u16 q2, q0 \n" // _src02tt_r
|
|
|
|
"sub %2, %2, %10, lsl #2 \n" // restore src1
|
|
|
|
"vtrn.u16 q3, q1 \n" // _src13tt_r
|
|
|
|
"add %1, #16 \n" // src0 += 16
|
|
|
|
"vtrn.u16 q10, q8 \n" // _src46tt_r
|
|
|
|
"add %2, #16 \n" // src1 += 16
|
|
|
|
"vtrn.u16 q11, q9 \n" // _src57tt_r
|
|
|
|
"vtrn.u32 q10, q2 \n" // _src26ttt_r
|
|
|
|
"vtrn.u32 q11, q3 \n" // _src37ttt_r
|
|
"vst2.u8 {d20-d21}, [%4], %11\n"
|
|
|
|
"vtrn.u32 q8, q0 \n" // _src04ttt_r
|
|
"vst2.u8 {d22-d23}, [%3], %11\n"
|
|
|
|
"vtrn.u32 q9, q1 \n" // _src15ttt_r
|
|
"vst2.u8 {d16-d17}, [%4], %11\n"
|
|
|
|
"subs %0, #1 \n"
|
|
|
|
"vst2.u8 {d4-d5}, [%4], %11 \n"
|
|
"vst2.u8 {d18-d19}, [%3], %11\n"
|
|
"vst2.u8 {d6-d7}, [%3], %11 \n"
|
|
"vst2.u8 {d0-d1}, [%4], %11 \n"
|
|
"vst2.u8 {d2-d3}, [%3], %11 \n"
|
|
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(src1), // %2
|
|
"=r"(dst7), // %3
|
|
"=r"(dst6) // %4
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(src1),
|
|
"3"(dst7),
|
|
"4"(dst6),
|
|
"r"(src_step), // %10
|
|
"r"(dst_step) // %11
|
|
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
|
|
}
|
|
#endif // __aarch64__
|
|
        for (; remain > 0; remain--)
        {
            dst7[0] = src1[0 + 3 * src_step];
            dst7[1] = src1[1 + 3 * src_step];
            dst7[2] = src0[0 + 3 * src_step];
            dst7[3] = src0[1 + 3 * src_step];
            dst7[4] = src1[0 + 2 * src_step];
            dst7[5] = src1[1 + 2 * src_step];
            dst7[6] = src0[0 + 2 * src_step];
            dst7[7] = src0[1 + 2 * src_step];
            dst7[8] = src1[0 + src_step];
            dst7[9] = src1[1 + src_step];
            dst7[10] = src0[0 + src_step];
            dst7[11] = src0[1 + src_step];
            dst7[12] = src1[0];
            dst7[13] = src1[1];
            dst7[14] = src0[0];
            dst7[15] = src0[1];

            src0 += 2;
            src1 += 2;

            dst7 -= stride;
        }

        src0 += srcwgap + 7 * srcstride;
    }
#endif // __ARM_NEON
    for (; y < srch; y++)
    {
        unsigned char* dst0 = dstend - y * 2 - 2;

        int x = 0;
        for (; x < srcw; x++)
        {
            dst0[0] = src0[0];
            dst0[1] = src0[1];

            src0 += 2;
            dst0 -= stride;
        }

        src0 += srcwgap;
    }
}

static void kanna_rotate_7_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw * 3;
|
|
|
|
// point to the last dst pixel
|
|
unsigned char* dstend = dst + stride * (h - 1) + w * 3;
|
|
|
|
const unsigned char* src0 = src;
|
|
|
|
int y = 0;
|
|
#if __ARM_NEON
|
|
for (; y + 7 < srch; y += 8)
|
|
{
|
|
const unsigned char* src1 = src0 + srcstride;
|
|
|
|
unsigned char* dst6 = dstend - y * 3 - 8 * 3 - stride;
|
|
unsigned char* dst7 = dstend - y * 3 - 8 * 3;
|
|
|
|
int src_step = 2 * srcstride;
|
|
int dst_step = -2 * stride;
|
|
|
|
int nn = srcw >> 3;
|
|
int remain = srcw - (nn << 3);
|
|
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x8x3_t _src0 = vld3_u8(src0);
|
|
uint8x8x3_t _src1 = vld3_u8(src1);
|
|
|
|
uint8x8x3_t _src2 = vld3_u8(src0 + src_step);
|
|
uint8x8x3_t _src3 = vld3_u8(src1 + src_step);
|
|
|
|
uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step);
|
|
uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step);
|
|
|
|
uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step);
|
|
uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step);
|
|
|
|
uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
|
|
uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
|
|
uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
|
|
uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);
|
|
|
|
uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
|
|
uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
|
|
uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
|
|
uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);
|
|
|
|
uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]);
|
|
uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]);
|
|
uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]);
|
|
uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]);
|
|
|
|
uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
|
|
uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
|
|
uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
|
|
uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
|
|
|
|
uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
|
|
uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
|
|
uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
|
|
uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));
|
|
|
|
uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1]));
|
|
uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0]));
|
|
uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1]));
|
|
uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0]));
|
|
|
|
uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
|
|
uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
|
|
uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
|
|
uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
|
|
|
|
uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
|
|
uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
|
|
uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
|
|
uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));
|
|
|
|
uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1]));
|
|
uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1]));
|
|
uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0]));
|
|
uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0]));
|
|
|
|
uint8x8x3_t _dst0;
|
|
uint8x8x3_t _dst1;
|
|
uint8x8x3_t _dst2;
|
|
uint8x8x3_t _dst3;
|
|
uint8x8x3_t _dst4;
|
|
uint8x8x3_t _dst5;
|
|
uint8x8x3_t _dst6;
|
|
uint8x8x3_t _dst7;
|
|
|
|
_dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
|
|
_dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
|
|
_dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
|
|
_dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
|
|
_dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
|
|
_dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
|
|
_dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
|
|
_dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
|
|
|
|
_dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
|
|
_dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
|
|
_dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
|
|
_dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
|
|
_dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
|
|
_dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
|
|
_dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
|
|
_dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
|
|
|
|
_dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
|
|
_dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
|
|
_dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
|
|
_dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
|
|
_dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
|
|
_dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
|
|
_dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
|
|
_dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
|
|
|
|
vst3_u8(dst7, _dst7);
|
|
vst3_u8(dst6, _dst6);
|
|
vst3_u8(dst7 + dst_step, _dst5);
|
|
vst3_u8(dst6 + dst_step, _dst4);
|
|
vst3_u8(dst7 + 2 * dst_step, _dst3);
|
|
vst3_u8(dst6 + 2 * dst_step, _dst2);
|
|
vst3_u8(dst7 + 3 * dst_step, _dst1);
|
|
vst3_u8(dst6 + 3 * dst_step, _dst0);
|
|
|
|
src0 += 3 * 8;
|
|
src1 += 3 * 8;
|
|
|
|
dst7 += 4 * dst_step;
|
|
dst6 += 4 * dst_step;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"0: \n"
|
|
"pld [%1, #192] \n"
|
|
"vld3.u8 {d0-d2}, [%1], %10 \n"
|
|
|
|
"pld [%2, #192] \n"
|
|
"vld3.u8 {d4-d6}, [%2], %10 \n"
|
|
|
|
"pld [%1, #192] \n"
|
|
"vld3.u8 {d8-d10}, [%1], %10 \n"
|
|
|
|
"vtrn.u8 q2, q0 \n" // _src01t_r
|
|
"vtrn.u8 d6, d2 \n"
|
|
|
|
"pld [%2, #192] \n"
|
|
"vld3.u8 {d12-d14}, [%2], %10\n"
|
|
|
|
"pld [%1, #192] \n"
|
|
"vld3.u8 {d16-d18}, [%1], %10\n"
|
|
|
|
"vtrn.u8 q6, q4 \n" // _src23t_r
|
|
"vtrn.u8 d14, d10 \n"
|
|
|
|
"pld [%2, #192] \n"
|
|
"vld3.u8 {d20-d22}, [%2], %10\n"
|
|
|
|
"pld [%1, #192] \n"
|
|
"vld3.u8 {d24-d26}, [%1], %10\n"
|
|
|
|
"vtrn.u8 q10, q8 \n" // _src45t_r
|
|
"vtrn.u8 d22, d18 \n"
|
|
|
|
"pld [%2, #192] \n"
|
|
"vld3.u8 {d28-d30}, [%2], %10\n"
|
|
|
|
"vtrn.u8 q14, q12 \n" // _src67t_r
|
|
"vtrn.u8 d30, d26 \n"
|
|
|
|
"sub %1, %1, %10, lsl #2 \n" // restore src0
|
|
|
|
"vtrn.u16 q4, q0 \n" // _src02tt_r
|
|
"vtrn.u16 d10, d2 \n"
|
|
|
|
"sub %2, %2, %10, lsl #2 \n" // restore src1
|
|
|
|
"vtrn.u16 q6, q2 \n" // _src13tt_r
|
|
"vtrn.u16 d14, d6 \n"
|
|
|
|
"add %1, #24 \n" // src0 += 24
|
|
|
|
"vtrn.u16 q12, q8 \n" // _src46tt_r
|
|
"vtrn.u16 d26, d18 \n"
|
|
|
|
"add %2, #24 \n" // src1 += 24
|
|
|
|
"vtrn.u16 q14, q10 \n" // _src57tt_r
|
|
"vtrn.u16 d30, d22 \n"
|
|
|
|
"vtrn.u32 q12, q4 \n" // _src26ttt_r
|
|
"vtrn.u32 d26, d10 \n"
|
|
|
|
"vtrn.u32 q14, q6 \n" // _src37ttt_r
|
|
"vst3.u8 {d24-d26}, [%4], %11\n"
|
|
"vtrn.u32 d30, d14 \n"
|
|
|
|
"vtrn.u32 q8, q0 \n" // _src04ttt_r
|
|
"vst3.u8 {d28-d30}, [%3], %11\n"
|
|
"vtrn.u32 d18, d2 \n"
|
|
|
|
"vtrn.u32 q10, q2 \n" // _src15ttt_r
|
|
"vst3.u8 {d16-d18}, [%4], %11\n"
|
|
"vtrn.u32 d22, d6 \n"
|
|
|
|
"subs %0, #1 \n"
|
|
|
|
"vst3.u8 {d8-d10}, [%4], %11 \n"
|
|
"vst3.u8 {d20-d22}, [%3], %11\n"
|
|
"vst3.u8 {d12-d14}, [%3], %11\n"
|
|
"vst3.u8 {d0-d2}, [%4], %11 \n"
|
|
"vst3.u8 {d4-d6}, [%3], %11 \n"
|
|
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(src1), // %2
|
|
"=r"(dst7), // %3
|
|
"=r"(dst6) // %4
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(src1),
|
|
"3"(dst7),
|
|
"4"(dst6),
|
|
"r"(src_step), // %10
|
|
"r"(dst_step) // %11
|
|
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
|
|
}
|
|
#endif // __aarch64__
|
|
        for (; remain > 0; remain--)
        {
            dst7[0] = src1[0 + 3 * src_step];
            dst7[1] = src1[1 + 3 * src_step];
            dst7[2] = src1[2 + 3 * src_step];
            dst7[3] = src0[0 + 3 * src_step];
            dst7[4] = src0[1 + 3 * src_step];
            dst7[5] = src0[2 + 3 * src_step];
            dst7[6] = src1[0 + 2 * src_step];
            dst7[7] = src1[1 + 2 * src_step];
            dst7[8] = src1[2 + 2 * src_step];
            dst7[9] = src0[0 + 2 * src_step];
            dst7[10] = src0[1 + 2 * src_step];
            dst7[11] = src0[2 + 2 * src_step];
            dst7[12] = src1[0 + src_step];
            dst7[13] = src1[1 + src_step];
            dst7[14] = src1[2 + src_step];
            dst7[15] = src0[0 + src_step];
            dst7[16] = src0[1 + src_step];
            dst7[17] = src0[2 + src_step];
            dst7[18] = src1[0];
            dst7[19] = src1[1];
            dst7[20] = src1[2];
            dst7[21] = src0[0];
            dst7[22] = src0[1];
            dst7[23] = src0[2];

            src0 += 3;
            src1 += 3;

            dst7 -= stride;
        }

        src0 += srcwgap + 7 * srcstride;
    }
#endif // __ARM_NEON
    for (; y < srch; y++)
    {
        unsigned char* dst0 = dstend - y * 3 - 3;

        int x = 0;
        for (; x < srcw; x++)
        {
            dst0[0] = src0[0];
            dst0[1] = src0[1];
            dst0[2] = src0[2];

            src0 += 3;
            dst0 -= stride;
        }

        src0 += srcwgap;
    }
}

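// 4-channel variant (e.g. RGBA): identical transpose per plane via vld4_u8/vst4_u8;
// the scalar remainder loop mirrors the same index mapping four bytes per pixel.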
static void kanna_rotate_7_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw * 4;
|
|
|
|
// point to the last dst pixel
|
|
unsigned char* dstend = dst + stride * (h - 1) + w * 4;
|
|
|
|
const unsigned char* src0 = src;
|
|
|
|
int y = 0;
|
|
#if __ARM_NEON
|
|
for (; y + 7 < srch; y += 8)
|
|
{
|
|
const unsigned char* src1 = src0 + srcstride;
|
|
|
|
unsigned char* dst6 = dstend - y * 4 - 8 * 4 - stride;
|
|
unsigned char* dst7 = dstend - y * 4 - 8 * 4;
|
|
|
|
int src_step = 2 * srcstride;
|
|
int dst_step = -2 * stride;
|
|
|
|
int nn = srcw >> 3;
|
|
int remain = srcw - (nn << 3);
|
|
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x8x4_t _src0 = vld4_u8(src0);
|
|
uint8x8x4_t _src1 = vld4_u8(src1);
|
|
|
|
uint8x8x4_t _src2 = vld4_u8(src0 + src_step);
|
|
uint8x8x4_t _src3 = vld4_u8(src1 + src_step);
|
|
|
|
uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step);
|
|
uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step);
|
|
|
|
uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step);
|
|
uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step);
|
|
|
|
uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
|
|
uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
|
|
uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
|
|
uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);
|
|
|
|
uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
|
|
uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
|
|
uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
|
|
uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);
|
|
|
|
uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]);
|
|
uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]);
|
|
uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]);
|
|
uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]);
|
|
|
|
uint8x8x2_t _src01t_a = vtrn_u8(_src1.val[3], _src0.val[3]);
|
|
uint8x8x2_t _src23t_a = vtrn_u8(_src3.val[3], _src2.val[3]);
|
|
uint8x8x2_t _src45t_a = vtrn_u8(_src5.val[3], _src4.val[3]);
|
|
uint8x8x2_t _src67t_a = vtrn_u8(_src7.val[3], _src6.val[3]);
|
|
|
|
uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
|
|
uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
|
|
uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
|
|
uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
|
|
|
|
uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
|
|
uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
|
|
uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
|
|
uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));
|
|
|
|
uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1]));
|
|
uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0]));
|
|
uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1]));
|
|
uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0]));
|
|
|
|
uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[1]), vreinterpret_u16_u8(_src01t_a.val[1]));
|
|
uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[0]), vreinterpret_u16_u8(_src01t_a.val[0]));
|
|
uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[1]), vreinterpret_u16_u8(_src45t_a.val[1]));
|
|
uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[0]), vreinterpret_u16_u8(_src45t_a.val[0]));
|
|
|
|
uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
|
|
uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
|
|
uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
|
|
uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
|
|
|
|
uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
|
|
uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
|
|
uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
|
|
uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));
|
|
|
|
uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1]));
|
|
uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1]));
|
|
uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0]));
|
|
uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0]));
|
|
|
|
uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[1]), vreinterpret_u32_u16(_src02tt_a.val[1]));
|
|
uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[1]), vreinterpret_u32_u16(_src13tt_a.val[1]));
|
|
uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[0]), vreinterpret_u32_u16(_src02tt_a.val[0]));
|
|
uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[0]), vreinterpret_u32_u16(_src13tt_a.val[0]));
|
|
|
|
uint8x8x4_t _dst0;
|
|
uint8x8x4_t _dst1;
|
|
uint8x8x4_t _dst2;
|
|
uint8x8x4_t _dst3;
|
|
uint8x8x4_t _dst4;
|
|
uint8x8x4_t _dst5;
|
|
uint8x8x4_t _dst6;
|
|
uint8x8x4_t _dst7;
|
|
|
|
_dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
|
|
_dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
|
|
_dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
|
|
_dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
|
|
_dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
|
|
_dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
|
|
_dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
|
|
_dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
|
|
|
|
_dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
|
|
_dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
|
|
_dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
|
|
_dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
|
|
_dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
|
|
_dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
|
|
_dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
|
|
_dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
|
|
|
|
_dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
|
|
_dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
|
|
_dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
|
|
_dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
|
|
_dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
|
|
_dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
|
|
_dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
|
|
_dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
|
|
|
|
_dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]);
|
|
_dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]);
|
|
_dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]);
|
|
_dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]);
|
|
_dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]);
|
|
_dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]);
|
|
_dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]);
|
|
_dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]);
|
|
|
|
vst4_u8(dst7, _dst7);
|
|
vst4_u8(dst6, _dst6);
|
|
vst4_u8(dst7 + dst_step, _dst5);
|
|
vst4_u8(dst6 + dst_step, _dst4);
|
|
vst4_u8(dst7 + 2 * dst_step, _dst3);
|
|
vst4_u8(dst6 + 2 * dst_step, _dst2);
|
|
vst4_u8(dst7 + 3 * dst_step, _dst1);
|
|
vst4_u8(dst6 + 3 * dst_step, _dst0);
|
|
|
|
src0 += 4 * 8;
|
|
src1 += 4 * 8;
|
|
|
|
dst7 += 4 * dst_step;
|
|
dst6 += 4 * dst_step;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"0: \n"
|
|
"pld [%1, #256] \n"
|
|
"vld4.u8 {d0-d3}, [%1], %10 \n"
|
|
|
|
"pld [%2, #256] \n"
|
|
"vld4.u8 {d4-d7}, [%2], %10 \n"
|
|
|
|
"pld [%1, #256] \n"
|
|
"vld4.u8 {d8-d11}, [%1], %10 \n"
|
|
|
|
"vtrn.u8 q2, q0 \n" // _src01t_r
|
|
"vtrn.u8 q3, q1 \n"
|
|
|
|
"pld [%2, #256] \n"
|
|
"vld4.u8 {d12-d15}, [%2], %10\n"
|
|
|
|
"pld [%1, #256] \n"
|
|
"vld4.u8 {d16-d19}, [%1], %10\n"
|
|
|
|
"vtrn.u8 q6, q4 \n" // _src23t_r
|
|
"vtrn.u8 q7, q5 \n"
|
|
|
|
"pld [%2, #256] \n"
|
|
"vld4.u8 {d20-d23}, [%2], %10\n"
|
|
|
|
"pld [%1, #256] \n"
|
|
"vld4.u8 {d24-d27}, [%1], %10\n"
|
|
|
|
"vtrn.u8 q10, q8 \n" // _src45t_r
|
|
"vtrn.u8 q11, q9 \n"
|
|
|
|
"pld [%2, #256] \n"
|
|
"vld4.u8 {d28-d31}, [%2], %10\n"
|
|
|
|
"vtrn.u8 q14, q12 \n" // _src67t_r
|
|
"vtrn.u8 q15, q13 \n"
|
|
|
|
"sub %1, %1, %10, lsl #2 \n" // restore src0
|
|
|
|
"vtrn.u16 q4, q0 \n" // _src02tt_r
|
|
"vtrn.u16 q5, q1 \n"
|
|
|
|
"sub %2, %2, %10, lsl #2 \n" // restore src1
|
|
|
|
"vtrn.u16 q6, q2 \n" // _src13tt_r
|
|
"vtrn.u16 q7, q3 \n"
|
|
|
|
"add %1, #32 \n" // src0 += 32
|
|
|
|
"vtrn.u16 q12, q8 \n" // _src46tt_r
|
|
"vtrn.u16 q13, q9 \n"
|
|
|
|
"add %2, #32 \n" // src1 += 32
|
|
|
|
"vtrn.u16 q14, q10 \n" // _src57tt_r
|
|
"vtrn.u16 q15, q11 \n"
|
|
|
|
"vtrn.u32 q12, q4 \n" // _src26ttt_r
|
|
"vtrn.u32 q13, q5 \n"
|
|
|
|
"vtrn.u32 q14, q6 \n" // _src37ttt_r
|
|
"vst4.u8 {d24-d27}, [%4], %11\n"
|
|
"vtrn.u32 q15, q7 \n"
|
|
|
|
"vtrn.u32 q8, q0 \n" // _src04ttt_r
|
|
"vst4.u8 {d28-d31}, [%3], %11\n"
|
|
"vtrn.u32 q9, q1 \n"
|
|
|
|
"vtrn.u32 q10, q2 \n" // _src15ttt_r
|
|
"vst4.u8 {d16-d19}, [%4], %11\n"
|
|
"vtrn.u32 q11, q3 \n"
|
|
|
|
"subs %0, #1 \n"
|
|
|
|
"vst4.u8 {d8-d11}, [%4], %11 \n"
|
|
"vst4.u8 {d20-d23}, [%3], %11\n"
|
|
"vst4.u8 {d12-d15}, [%3], %11\n"
|
|
"vst4.u8 {d0-d3}, [%4], %11 \n"
|
|
"vst4.u8 {d4-d7}, [%3], %11 \n"
|
|
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(src1), // %2
|
|
"=r"(dst7), // %3
|
|
"=r"(dst6) // %4
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(src1),
|
|
"3"(dst7),
|
|
"4"(dst6),
|
|
"r"(src_step), // %10
|
|
"r"(dst_step) // %11
|
|
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
|
|
}
|
|
#endif // __aarch64__
|
|
        for (; remain > 0; remain--)
        {
            dst7[0] = src1[0 + 3 * src_step];
            dst7[1] = src1[1 + 3 * src_step];
            dst7[2] = src1[2 + 3 * src_step];
            dst7[3] = src1[3 + 3 * src_step];
            dst7[4] = src0[0 + 3 * src_step];
            dst7[5] = src0[1 + 3 * src_step];
            dst7[6] = src0[2 + 3 * src_step];
            dst7[7] = src0[3 + 3 * src_step];
            dst7[8] = src1[0 + 2 * src_step];
            dst7[9] = src1[1 + 2 * src_step];
            dst7[10] = src1[2 + 2 * src_step];
            dst7[11] = src1[3 + 2 * src_step];
            dst7[12] = src0[0 + 2 * src_step];
            dst7[13] = src0[1 + 2 * src_step];
            dst7[14] = src0[2 + 2 * src_step];
            dst7[15] = src0[3 + 2 * src_step];
            dst7[16] = src1[0 + src_step];
            dst7[17] = src1[1 + src_step];
            dst7[18] = src1[2 + src_step];
            dst7[19] = src1[3 + src_step];
            dst7[20] = src0[0 + src_step];
            dst7[21] = src0[1 + src_step];
            dst7[22] = src0[2 + src_step];
            dst7[23] = src0[3 + src_step];
            dst7[24] = src1[0];
            dst7[25] = src1[1];
            dst7[26] = src1[2];
            dst7[27] = src1[3];
            dst7[28] = src0[0];
            dst7[29] = src0[1];
            dst7[30] = src0[2];
            dst7[31] = src0[3];

            src0 += 4;
            src1 += 4;

            dst7 -= stride;
        }

        src0 += srcwgap + 7 * srcstride;
    }
#endif // __ARM_NEON
    for (; y < srch; y++)
    {
        unsigned char* dst0 = dstend - y * 4 - 4;

        int x = 0;
        for (; x < srcw; x++)
        {
            dst0[0] = src0[0];
            dst0[1] = src0[1];
            dst0[2] = src0[2];
            dst0[3] = src0[3];

            src0 += 4;
            dst0 -= stride;
        }

        src0 += srcwgap;
    }
}

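// The kanna_rotate_8_* kernels write the destination column by column, starting
// from the bottom row, which amounts to a 90-degree counter-clockwise rotation
// (EXIF-style orientation 8 in the usual numbering). A scalar sketch of the
// mapping the single-channel kernel below implements (illustrative only, not
// part of the original code):
//
//     // dst is srch columns wide and srcw rows tall
//     for (int y = 0; y < srch; y++)
//         for (int x = 0; x < srcw; x++)
//             dst[(srcw - 1 - x) * stride + y] = src[y * srcstride + x];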
static void kanna_rotate_8_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw;
|
|
|
|
// point to the last dst pixel row
|
|
unsigned char* dstend = dst + stride * (h - 1);
|
|
|
|
const unsigned char* src0 = src;
|
|
|
|
int y = 0;
|
|
#if __ARM_NEON
|
|
for (; y + 7 < srch; y += 8)
|
|
{
|
|
const unsigned char* src1 = src0 + srcstride;
|
|
|
|
unsigned char* dst7 = dstend + y;
|
|
unsigned char* dst6 = dstend + y - stride;
|
|
|
|
int src_step = 2 * srcstride;
|
|
int dst_step = -2 * stride;
|
|
|
|
int nn = srcw >> 3;
|
|
int remain = srcw - (nn << 3);
|
|
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x8_t _src0 = vld1_u8(src0);
|
|
uint8x8_t _src1 = vld1_u8(src1);
|
|
|
|
uint8x8_t _src2 = vld1_u8(src0 + src_step);
|
|
uint8x8_t _src3 = vld1_u8(src1 + src_step);
|
|
|
|
uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step);
|
|
uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step);
|
|
|
|
uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step);
|
|
uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step);
|
|
|
|
uint8x8x2_t _src01t_r = vtrn_u8(_src0, _src1);
|
|
uint8x8x2_t _src23t_r = vtrn_u8(_src2, _src3);
|
|
uint8x8x2_t _src45t_r = vtrn_u8(_src4, _src5);
|
|
uint8x8x2_t _src67t_r = vtrn_u8(_src6, _src7);
|
|
|
|
uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
|
|
uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
|
|
uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
|
|
uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
|
|
|
|
uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
|
|
uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
|
|
uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
|
|
uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
|
|
|
|
uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[0]);
|
|
uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[0]);
|
|
uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[0]);
|
|
uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[0]);
|
|
uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[1]);
|
|
uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[1]);
|
|
uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[1]);
|
|
uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[1]);
|
|
|
|
vst1_u8(dst7, _dst0);
|
|
vst1_u8(dst6, _dst1);
|
|
vst1_u8(dst7 + dst_step, _dst2);
|
|
vst1_u8(dst6 + dst_step, _dst3);
|
|
vst1_u8(dst7 + 2 * dst_step, _dst4);
|
|
vst1_u8(dst6 + 2 * dst_step, _dst5);
|
|
vst1_u8(dst7 + 3 * dst_step, _dst6);
|
|
vst1_u8(dst6 + 3 * dst_step, _dst7);
|
|
|
|
src0 += 8;
|
|
src1 += 8;
|
|
|
|
dst7 += 4 * dst_step;
|
|
dst6 += 4 * dst_step;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"0: \n"
|
|
"pld [%1, #64] \n"
|
|
"vld1.u8 {d0}, [%1], %10 \n"
|
|
|
|
"pld [%2, #64] \n"
|
|
"vld1.u8 {d1}, [%2], %10 \n"
|
|
|
|
"pld [%1, #64] \n"
|
|
"vld1.u8 {d2}, [%1], %10 \n"
|
|
|
|
"vtrn.u8 d0, d1 \n" // _src01t_r
|
|
|
|
"pld [%2, #64] \n"
|
|
"vld1.u8 {d3}, [%2], %10 \n"
|
|
|
|
"pld [%1, #64] \n"
|
|
"vld1.u8 {d4}, [%1], %10 \n"
|
|
|
|
"vtrn.u8 d2, d3 \n" // _src23t_r
|
|
|
|
"pld [%2, #64] \n"
|
|
"vld1.u8 {d5}, [%2], %10 \n"
|
|
|
|
"pld [%1, #64] \n"
|
|
"vld1.u8 {d6}, [%1], %10 \n"
|
|
|
|
"vtrn.u8 d4, d5 \n" // _src45t_r
|
|
|
|
"pld [%2, #64] \n"
|
|
"vld1.u8 {d7}, [%2], %10 \n"
|
|
|
|
"vtrn.u8 d6, d7 \n" // _src67t_r
|
|
|
|
"sub %1, %1, %10, lsl #2 \n" // restore src0
|
|
|
|
"vtrn.u16 q0, q1 \n" // _src02tt_r _src13tt_r
|
|
|
|
"sub %2, %2, %10, lsl #2 \n" // restore src1
|
|
|
|
"vtrn.u16 q2, q3 \n" // _src46tt_r _src57tt_r
|
|
|
|
"add %1, #8 \n" // src0 += 8
|
|
|
|
"vtrn.u32 q0, q2 \n" // _src04ttt_r _src15ttt_r
|
|
|
|
"add %2, #8 \n" // src1 += 8
|
|
|
|
"vtrn.u32 q1, q3 \n" // _src26ttt_r _src37ttt_r
|
|
"vst1.u8 {d0}, [%3], %11 \n"
|
|
"vst1.u8 {d1}, [%4], %11 \n"
|
|
|
|
"subs %0, #1 \n"
|
|
|
|
"vst1.u8 {d2}, [%3], %11 \n"
|
|
"vst1.u8 {d3}, [%4], %11 \n"
|
|
"vst1.u8 {d4}, [%3], %11 \n"
|
|
"vst1.u8 {d5}, [%4], %11 \n"
|
|
"vst1.u8 {d6}, [%3], %11 \n"
|
|
"vst1.u8 {d7}, [%4], %11 \n"
|
|
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(src1), // %2
|
|
"=r"(dst7), // %3
|
|
"=r"(dst6) // %4
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(src1),
|
|
"3"(dst7),
|
|
"4"(dst6),
|
|
"r"(src_step), // %10
|
|
"r"(dst_step) // %11
|
|
: "cc", "memory", "q0", "q1", "q2", "q3");
|
|
}
|
|
#endif // __aarch64__
|
|
        for (; remain > 0; remain--)
        {
            dst7[0] = src0[0];
            dst7[1] = src1[0];
            dst7[2] = src0[0 + src_step];
            dst7[3] = src1[0 + src_step];
            dst7[4] = src0[0 + 2 * src_step];
            dst7[5] = src1[0 + 2 * src_step];
            dst7[6] = src0[0 + 3 * src_step];
            dst7[7] = src1[0 + 3 * src_step];

            src0 += 1;
            src1 += 1;

            dst7 -= stride;
        }

        src0 += srcwgap + 7 * srcstride;
    }
#endif // __ARM_NEON
    for (; y < srch; y++)
    {
        unsigned char* dst0 = dstend + y;

        int x = 0;
        for (; x < srcw; x++)
        {
            *dst0 = *src0;

            src0 += 1;
            dst0 -= stride;
        }

        src0 += srcwgap;
    }
}

static void kanna_rotate_8_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw * 2;
|
|
|
|
// point to the last dst pixel row
|
|
unsigned char* dstend = dst + stride * (h - 1);
|
|
|
|
const unsigned char* src0 = src;
|
|
|
|
int y = 0;
|
|
#if __ARM_NEON
|
|
for (; y + 7 < srch; y += 8)
|
|
{
|
|
const unsigned char* src1 = src0 + srcstride;
|
|
|
|
unsigned char* dst7 = dstend + y * 2;
|
|
unsigned char* dst6 = dstend + y * 2 - stride;
|
|
|
|
int src_step = 2 * srcstride;
|
|
int dst_step = -2 * stride;
|
|
|
|
int nn = srcw >> 3;
|
|
int remain = srcw - (nn << 3);
|
|
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x8x2_t _src0 = vld2_u8(src0);
|
|
uint8x8x2_t _src1 = vld2_u8(src1);
|
|
|
|
uint8x8x2_t _src2 = vld2_u8(src0 + src_step);
|
|
uint8x8x2_t _src3 = vld2_u8(src1 + src_step);
|
|
|
|
uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step);
|
|
uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step);
|
|
|
|
uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step);
|
|
uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step);
|
|
|
|
uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
|
|
uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
|
|
uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
|
|
uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);
|
|
|
|
uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
|
|
uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
|
|
uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
|
|
uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);
|
|
|
|
uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
|
|
uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
|
|
uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
|
|
uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
|
|
|
|
uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
|
|
uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
|
|
uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
|
|
uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));
|
|
|
|
uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
|
|
uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
|
|
uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
|
|
uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
|
|
|
|
uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
|
|
uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
|
|
uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
|
|
uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));
|
|
|
|
uint8x8x2_t _dst0;
|
|
uint8x8x2_t _dst1;
|
|
uint8x8x2_t _dst2;
|
|
uint8x8x2_t _dst3;
|
|
uint8x8x2_t _dst4;
|
|
uint8x8x2_t _dst5;
|
|
uint8x8x2_t _dst6;
|
|
uint8x8x2_t _dst7;
|
|
|
|
_dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
|
|
_dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
|
|
_dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
|
|
_dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
|
|
_dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
|
|
_dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
|
|
_dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
|
|
_dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
|
|
|
|
_dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
|
|
_dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
|
|
_dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
|
|
_dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
|
|
_dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
|
|
_dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
|
|
_dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
|
|
_dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
|
|
|
|
vst2_u8(dst7, _dst0);
|
|
vst2_u8(dst6, _dst1);
|
|
vst2_u8(dst7 + dst_step, _dst2);
|
|
vst2_u8(dst6 + dst_step, _dst3);
|
|
vst2_u8(dst7 + 2 * dst_step, _dst4);
|
|
vst2_u8(dst6 + 2 * dst_step, _dst5);
|
|
vst2_u8(dst7 + 3 * dst_step, _dst6);
|
|
vst2_u8(dst6 + 3 * dst_step, _dst7);
|
|
|
|
src0 += 2 * 8;
|
|
src1 += 2 * 8;
|
|
|
|
dst7 += 4 * dst_step;
|
|
dst6 += 4 * dst_step;
|
|
}
|
|
#else
|
|
if (nn > 0)
|
|
{
|
|
asm volatile(
|
|
"0: \n"
|
|
"pld [%1, #128] \n"
|
|
"vld2.u8 {d0-d1}, [%1], %10 \n"
|
|
|
|
"pld [%2, #128] \n"
|
|
"vld2.u8 {d2-d3}, [%2], %10 \n"
|
|
|
|
"pld [%1, #128] \n"
|
|
"vld2.u8 {d4-d5}, [%1], %10 \n"
|
|
|
|
"vtrn.u8 q0, q1 \n" // _src01t_r
|
|
|
|
"pld [%2, #128] \n"
|
|
"vld2.u8 {d6-d7}, [%2], %10 \n"
|
|
|
|
"pld [%1, #128] \n"
|
|
"vld2.u8 {d16-d17}, [%1], %10\n"
|
|
|
|
"vtrn.u8 q2, q3 \n" // _src23t_r
|
|
|
|
"pld [%2, #128] \n"
|
|
"vld2.u8 {d18-d19}, [%2], %10\n"
|
|
|
|
"pld [%1, #128] \n"
|
|
"vld2.u8 {d20-d21}, [%1], %10\n"
|
|
|
|
"vtrn.u8 q8, q9 \n" // _src45t_r
|
|
|
|
"pld [%2, #128] \n"
|
|
"vld2.u8 {d22-d23}, [%2], %10\n"
|
|
|
|
"vtrn.u8 q10, q11 \n" // _src67t_r
|
|
|
|
"sub %1, %1, %10, lsl #2 \n" // restore src0
|
|
|
|
"vtrn.u16 q0, q2 \n" // _src02tt_r
|
|
|
|
"sub %2, %2, %10, lsl #2 \n" // restore src1
|
|
|
|
"vtrn.u16 q1, q3 \n" // _src13tt_r
|
|
|
|
"add %1, #16 \n" // src0 += 16
|
|
|
|
"vtrn.u16 q8, q10 \n" // _src46tt_r
|
|
|
|
"add %2, #16 \n" // src1 += 16
|
|
|
|
"vtrn.u16 q9, q11 \n" // _src57tt_r
|
|
|
|
"vtrn.u32 q0, q8 \n" // _src04ttt_r
|
|
|
|
"vtrn.u32 q1, q9 \n" // _src15ttt_r
|
|
"vst2.u8 {d0-d1}, [%3], %11 \n"
|
|
|
|
"vtrn.u32 q2, q10 \n" // _src26ttt_r
|
|
"vst2.u8 {d2-d3}, [%4], %11 \n"
|
|
|
|
"vtrn.u32 q3, q11 \n" // _src37ttt_r
|
|
"vst2.u8 {d4-d5}, [%3], %11 \n"
|
|
|
|
"subs %0, #1 \n"
|
|
|
|
"vst2.u8 {d16-d17}, [%3], %11\n"
|
|
"vst2.u8 {d6-d7}, [%4], %11 \n"
|
|
"vst2.u8 {d18-d19}, [%4], %11\n"
|
|
"vst2.u8 {d20-d21}, [%3], %11\n"
|
|
"vst2.u8 {d22-d23}, [%4], %11\n"
|
|
|
|
"bne 0b \n"
|
|
: "=r"(nn), // %0
|
|
"=r"(src0), // %1
|
|
"=r"(src1), // %2
|
|
"=r"(dst7), // %3
|
|
"=r"(dst6) // %4
|
|
: "0"(nn),
|
|
"1"(src0),
|
|
"2"(src1),
|
|
"3"(dst7),
|
|
"4"(dst6),
|
|
"r"(src_step), // %10
|
|
"r"(dst_step) // %11
|
|
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
|
|
}
|
|
#endif // __aarch64__
|
|
        for (; remain > 0; remain--)
        {
            dst7[0] = src0[0];
            dst7[1] = src0[1];
            dst7[2] = src1[0];
            dst7[3] = src1[1];
            dst7[4] = src0[0 + src_step];
            dst7[5] = src0[1 + src_step];
            dst7[6] = src1[0 + src_step];
            dst7[7] = src1[1 + src_step];
            dst7[8] = src0[0 + 2 * src_step];
            dst7[9] = src0[1 + 2 * src_step];
            dst7[10] = src1[0 + 2 * src_step];
            dst7[11] = src1[1 + 2 * src_step];
            dst7[12] = src0[0 + 3 * src_step];
            dst7[13] = src0[1 + 3 * src_step];
            dst7[14] = src1[0 + 3 * src_step];
            dst7[15] = src1[1 + 3 * src_step];

            src0 += 2;
            src1 += 2;

            dst7 -= stride;
        }

        src0 += srcwgap + 7 * srcstride;
    }
#endif // __ARM_NEON
    for (; y < srch; y++)
    {
        unsigned char* dst0 = dstend + y * 2;

        int x = 0;
        for (; x < srcw; x++)
        {
            dst0[0] = src0[0];
            dst0[1] = src0[1];

            src0 += 2;
            dst0 -= stride;
        }

        src0 += srcwgap;
    }
}

static void kanna_rotate_8_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride)
|
|
{
|
|
const int srcwgap = srcstride - srcw * 3;
|
|
|
|
// point to the last dst pixel row
|
|
unsigned char* dstend = dst + stride * (h - 1);
|
|
|
|
const unsigned char* src0 = src;
|
|
|
|
int y = 0;
|
|
#if __ARM_NEON
|
|
for (; y + 7 < srch; y += 8)
|
|
{
|
|
const unsigned char* src1 = src0 + srcstride;
|
|
|
|
unsigned char* dst7 = dstend + y * 3;
|
|
unsigned char* dst6 = dstend + y * 3 - stride;
|
|
|
|
int src_step = 2 * srcstride;
|
|
int dst_step = -2 * stride;
|
|
|
|
int nn = srcw >> 3;
|
|
int remain = srcw - (nn << 3);
|
|
|
|
#if __aarch64__
|
|
for (; nn > 0; nn--)
|
|
{
|
|
uint8x8x3_t _src0 = vld3_u8(src0);
|
|
uint8x8x3_t _src1 = vld3_u8(src1);
|
|
|
|
uint8x8x3_t _src2 = vld3_u8(src0 + src_step);
|
|
uint8x8x3_t _src3 = vld3_u8(src1 + src_step);
|
|
|
|
uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step);
|
|
uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step);
|
|
|
|
uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step);
|
|
uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step);
|
|
|
|
uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
|
|
uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
|
|
uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
|
|
uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);
|
|
|
|
uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
|
|
uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
|
|
uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
|
|
uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);
|
|
|
|
uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]);
|
|
uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]);
|
|
uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]);
|
|
uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]);
|
|
|
|
uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
|
|
uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
|
|
uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
|
|
uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
|
|
|
|
uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
|
|
uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
|
|
uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
|
|
uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));
|
|
|
|
uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0]));
|
|
uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1]));
|
|
uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0]));
|
|
uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1]));
|
|
|
|
uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
|
|
uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
|
|
uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
|
|
uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
|
|
|
|
uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
|
|
uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
|
|
uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
|
|
uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));
|
|
|
|
uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0]));
|
|
uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0]));
|
|
uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1]));
|
|
uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1]));
|
|
|
|
uint8x8x3_t _dst0;
|
|
uint8x8x3_t _dst1;
|
|
uint8x8x3_t _dst2;
|
|
uint8x8x3_t _dst3;
|
|
uint8x8x3_t _dst4;
|
|
uint8x8x3_t _dst5;
|
|
uint8x8x3_t _dst6;
|
|
uint8x8x3_t _dst7;
|
|
|
|
_dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
|
|
_dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
|
|
_dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
|
|
_dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
|
|
_dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
|
|
_dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
|
|
_dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
|
|
_dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
|
|
|
|
_dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
|
|
_dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
|
|
_dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
|
|
_dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
|
|
_dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
|
|
_dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
|
|
_dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
|
|
_dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
|
|
|
|
_dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
|
|
_dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
|
|
_dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
|
|
_dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
|
|
_dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
|
|
_dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
|
|
_dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
|
|
_dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
|
|
|
|
vst3_u8(dst7, _dst0);
|
|
vst3_u8(dst6, _dst1);
|
|
vst3_u8(dst7 + dst_step, _dst2);
|
|
vst3_u8(dst6 + dst_step, _dst3);
|
|
vst3_u8(dst7 + 2 * dst_step, _dst4);
|
|
vst3_u8(dst6 + 2 * dst_step, _dst5);
|
|
vst3_u8(dst7 + 3 * dst_step, _dst6);
|
|
vst3_u8(dst6 + 3 * dst_step, _dst7);
|
|
|
|
src0 += 3 * 8;
|
|
src1 += 3 * 8;
|
|
|
|
dst7 += 4 * dst_step;
|
|
dst6 += 4 * dst_step;
|
|
}
|
|
#else
        if (nn > 0)
        {
            asm volatile(
                "0: \n"
                "pld [%1, #192] \n"
                "vld3.u8 {d0-d2}, [%1], %10 \n"

                "pld [%2, #192] \n"
                "vld3.u8 {d4-d6}, [%2], %10 \n"

                "pld [%1, #192] \n"
                "vld3.u8 {d8-d10}, [%1], %10 \n"

                "vtrn.u8 q0, q2 \n" // _src01t_r
                "vtrn.u8 d2, d6 \n"

                "pld [%2, #192] \n"
                "vld3.u8 {d12-d14}, [%2], %10\n"

                "pld [%1, #192] \n"
                "vld3.u8 {d16-d18}, [%1], %10\n"

                "vtrn.u8 q4, q6 \n" // _src23t_r
                "vtrn.u8 d10, d14 \n"

                "pld [%2, #192] \n"
                "vld3.u8 {d20-d22}, [%2], %10\n"

                "pld [%1, #192] \n"
                "vld3.u8 {d24-d26}, [%1], %10\n"

                "vtrn.u8 q8, q10 \n" // _src45t_r
                "vtrn.u8 d18, d22 \n"

                "pld [%2, #192] \n"
                "vld3.u8 {d28-d30}, [%2], %10\n"

                "vtrn.u8 q12, q14 \n" // _src67t_r
                "vtrn.u8 d26, d30 \n"

                "sub %1, %1, %10, lsl #2 \n" // restore src0

                "vtrn.u16 q0, q4 \n" // _src02tt_r
                "vtrn.u16 d2, d10 \n"

                "sub %2, %2, %10, lsl #2 \n" // restore src1

                "vtrn.u16 q2, q6 \n" // _src13tt_r
                "vtrn.u16 d6, d14 \n"

                "add %1, #24 \n" // src0 += 24

                "vtrn.u16 q8, q12 \n" // _src46tt_r
                "vtrn.u16 d18, d26 \n"

                "add %2, #24 \n" // src1 += 24

                "vtrn.u16 q10, q14 \n" // _src57tt_r
                "vtrn.u16 d22, d30 \n"

                "vtrn.u32 q0, q8 \n" // _src04ttt_r
                "vtrn.u32 d2, d18 \n"

                "vtrn.u32 q2, q10 \n" // _src15ttt_r
                "vst3.u8 {d0-d2}, [%3], %11 \n"
                "vtrn.u32 d6, d22 \n"

                "vtrn.u32 q4, q12 \n" // _src26ttt_r
                "vst3.u8 {d4-d6}, [%4], %11 \n"
                "vtrn.u32 d10, d26 \n"

                "vtrn.u32 q6, q14 \n" // _src37ttt_r
                "vst3.u8 {d8-d10}, [%3], %11 \n"
                "vtrn.u32 d14, d30 \n"

                "subs %0, #1 \n"

                "vst3.u8 {d16-d18}, [%3], %11\n"
                "vst3.u8 {d12-d14}, [%4], %11\n"
                "vst3.u8 {d20-d22}, [%4], %11\n"
                "vst3.u8 {d24-d26}, [%3], %11\n"
                "vst3.u8 {d28-d30}, [%4], %11\n"

                "bne 0b \n"
                : "=r"(nn), // %0
                "=r"(src0), // %1
                "=r"(src1), // %2
                "=r"(dst7), // %3
                "=r"(dst6) // %4
                : "0"(nn),
                "1"(src0),
                "2"(src1),
                "3"(dst7),
                "4"(dst6),
                "r"(src_step), // %10
                "r"(dst_step) // %11
                : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
        }
#endif // __aarch64__
        for (; remain > 0; remain--)
        {
            dst7[0] = src0[0];
            dst7[1] = src0[1];
            dst7[2] = src0[2];
            dst7[3] = src1[0];
            dst7[4] = src1[1];
            dst7[5] = src1[2];
            dst7[6] = src0[0 + src_step];
            dst7[7] = src0[1 + src_step];
            dst7[8] = src0[2 + src_step];
            dst7[9] = src1[0 + src_step];
            dst7[10] = src1[1 + src_step];
            dst7[11] = src1[2 + src_step];
            dst7[12] = src0[0 + 2 * src_step];
            dst7[13] = src0[1 + 2 * src_step];
            dst7[14] = src0[2 + 2 * src_step];
            dst7[15] = src1[0 + 2 * src_step];
            dst7[16] = src1[1 + 2 * src_step];
            dst7[17] = src1[2 + 2 * src_step];
            dst7[18] = src0[0 + 3 * src_step];
            dst7[19] = src0[1 + 3 * src_step];
            dst7[20] = src0[2 + 3 * src_step];
            dst7[21] = src1[0 + 3 * src_step];
            dst7[22] = src1[1 + 3 * src_step];
            dst7[23] = src1[2 + 3 * src_step];

            src0 += 3;
            src1 += 3;

            dst7 -= stride;
        }

        src0 += srcwgap + 7 * srcstride;
    }
#endif // __ARM_NEON
    for (; y < srch; y++)
    {
        unsigned char* dst0 = dstend + y * 3;

        int x = 0;
        for (; x < srcw; x++)
        {
            dst0[0] = src0[0];
            dst0[1] = src0[1];
            dst0[2] = src0[2];

            src0 += 3;
            dst0 -= stride;
        }

        src0 += srcwgap;
    }
}

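// kanna_rotate_8_c4 rotates a 4-channel image 90 degrees counter-clockwise:
// source row y becomes destination column y, written from the bottom dst row upwards,
// so the destination is expected to be srch x srcw pixels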
static void kanna_rotate_8_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride)
{
    const int srcwgap = srcstride - srcw * 4;

    // point to the last dst pixel row
    unsigned char* dstend = dst + stride * (h - 1);

    const unsigned char* src0 = src;

    int y = 0;
#if __ARM_NEON
    for (; y + 7 < srch; y += 8)
    {
        const unsigned char* src1 = src0 + srcstride;

        unsigned char* dst7 = dstend + y * 4;
        unsigned char* dst6 = dstend + y * 4 - stride;

        int src_step = 2 * srcstride;
        int dst_step = -2 * stride;

        int nn = srcw >> 3;
        int remain = srcw - (nn << 3);

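        // process an 8x8 pixel tile per iteration: deinterleave eight source rows with vld4,
        // transpose each channel plane with vtrn at 8-bit, 16-bit and 32-bit granularity,
        // then re-interleave with vst4 into eight destination rows walking upwards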
#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x8x4_t _src0 = vld4_u8(src0);
            uint8x8x4_t _src1 = vld4_u8(src1);

            uint8x8x4_t _src2 = vld4_u8(src0 + src_step);
            uint8x8x4_t _src3 = vld4_u8(src1 + src_step);

            uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step);
            uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step);

            uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step);
            uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step);

            uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
            uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
            uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
            uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);

            uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
            uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
            uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
            uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);

            uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]);
            uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]);
            uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]);
            uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]);

            uint8x8x2_t _src01t_a = vtrn_u8(_src0.val[3], _src1.val[3]);
            uint8x8x2_t _src23t_a = vtrn_u8(_src2.val[3], _src3.val[3]);
            uint8x8x2_t _src45t_a = vtrn_u8(_src4.val[3], _src5.val[3]);
            uint8x8x2_t _src67t_a = vtrn_u8(_src6.val[3], _src7.val[3]);

            uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
            uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
            uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
            uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));

            uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
            uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
            uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
            uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));

            uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0]));
            uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1]));
            uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0]));
            uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1]));

            uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[0]), vreinterpret_u16_u8(_src23t_a.val[0]));
            uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[1]), vreinterpret_u16_u8(_src23t_a.val[1]));
            uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[0]), vreinterpret_u16_u8(_src67t_a.val[0]));
            uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[1]), vreinterpret_u16_u8(_src67t_a.val[1]));

            uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
            uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
            uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
            uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));

            uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
            uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
            uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
            uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));

            uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0]));
            uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0]));
            uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1]));
            uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1]));

            uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[0]), vreinterpret_u32_u16(_src46tt_a.val[0]));
            uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[0]), vreinterpret_u32_u16(_src57tt_a.val[0]));
            uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[1]), vreinterpret_u32_u16(_src46tt_a.val[1]));
            uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[1]), vreinterpret_u32_u16(_src57tt_a.val[1]));

            uint8x8x4_t _dst0;
            uint8x8x4_t _dst1;
            uint8x8x4_t _dst2;
            uint8x8x4_t _dst3;
            uint8x8x4_t _dst4;
            uint8x8x4_t _dst5;
            uint8x8x4_t _dst6;
            uint8x8x4_t _dst7;

            _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
            _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
            _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
            _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
            _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
            _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
            _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
            _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);

            _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
            _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
            _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
            _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
            _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
            _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
            _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
            _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);

            _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
            _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
            _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
            _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
            _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
            _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
            _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
            _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);

            _dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]);
            _dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]);
            _dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]);
            _dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]);
            _dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]);
            _dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]);
            _dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]);
            _dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]);

            vst4_u8(dst7, _dst0);
            vst4_u8(dst6, _dst1);
            vst4_u8(dst7 + dst_step, _dst2);
            vst4_u8(dst6 + dst_step, _dst3);
            vst4_u8(dst7 + 2 * dst_step, _dst4);
            vst4_u8(dst6 + 2 * dst_step, _dst5);
            vst4_u8(dst7 + 3 * dst_step, _dst6);
            vst4_u8(dst6 + 3 * dst_step, _dst7);

            src0 += 4 * 8;
            src1 += 4 * 8;

            dst7 += 4 * dst_step;
            dst6 += 4 * dst_step;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "0: \n"
                "pld [%1, #256] \n"
                "vld4.u8 {d0-d3}, [%1], %10 \n"

                "pld [%2, #256] \n"
                "vld4.u8 {d4-d7}, [%2], %10 \n"

                "pld [%1, #256] \n"
                "vld4.u8 {d8-d11}, [%1], %10 \n"

                "vtrn.u8 q0, q2 \n" // _src01t_r
                "vtrn.u8 q1, q3 \n"

                "pld [%2, #256] \n"
                "vld4.u8 {d12-d15}, [%2], %10\n"

                "pld [%1, #256] \n"
                "vld4.u8 {d16-d19}, [%1], %10\n"

                "vtrn.u8 q4, q6 \n" // _src23t_r
                "vtrn.u8 q5, q7 \n"

                "pld [%2, #256] \n"
                "vld4.u8 {d20-d23}, [%2], %10\n"

                "pld [%1, #256] \n"
                "vld4.u8 {d24-d27}, [%1], %10\n"

                "vtrn.u8 q8, q10 \n" // _src45t_r
                "vtrn.u8 q9, q11 \n"

                "pld [%2, #256] \n"
                "vld4.u8 {d28-d31}, [%2], %10\n"

                "vtrn.u8 q12, q14 \n" // _src67t_r
                "vtrn.u8 q13, q15 \n"

                "sub %1, %1, %10, lsl #2 \n" // restore src0

                "vtrn.u16 q0, q4 \n" // _src02tt_r
                "vtrn.u16 q1, q5 \n"

                "sub %2, %2, %10, lsl #2 \n" // restore src1

                "vtrn.u16 q2, q6 \n" // _src13tt_r
                "vtrn.u16 q3, q7 \n"

                "add %1, #32 \n" // src0 += 32

                "vtrn.u16 q8, q12 \n" // _src46tt_r
                "vtrn.u16 q9, q13 \n"

                "add %2, #32 \n" // src1 += 32

                "vtrn.u16 q10, q14 \n" // _src57tt_r
                "vtrn.u16 q11, q15 \n"

                "vtrn.u32 q0, q8 \n" // _src04ttt_r
                "vtrn.u32 q1, q9 \n"

                "vtrn.u32 q2, q10 \n" // _src15ttt_r
                "vst4.u8 {d0-d3}, [%3], %11 \n"
                "vtrn.u32 q3, q11 \n"

                "vtrn.u32 q4, q12 \n" // _src26ttt_r
                "vst4.u8 {d4-d7}, [%4], %11 \n"
                "vtrn.u32 q5, q13 \n"

                "vtrn.u32 q6, q14 \n" // _src37ttt_r
                "vst4.u8 {d8-d11}, [%3], %11 \n"
                "vtrn.u32 q7, q15 \n"

                "subs %0, #1 \n"

                "vst4.u8 {d16-d19}, [%3], %11\n"
                "vst4.u8 {d12-d15}, [%4], %11\n"
                "vst4.u8 {d20-d23}, [%4], %11\n"
                "vst4.u8 {d24-d27}, [%3], %11\n"
                "vst4.u8 {d28-d31}, [%4], %11\n"

                "bne 0b \n"
                : "=r"(nn), // %0
                "=r"(src0), // %1
                "=r"(src1), // %2
                "=r"(dst7), // %3
                "=r"(dst6) // %4
                : "0"(nn),
                "1"(src0),
                "2"(src1),
                "3"(dst7),
                "4"(dst6),
                "r"(src_step), // %10
                "r"(dst_step) // %11
                : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
        }
#endif // __aarch64__
        for (; remain > 0; remain--)
        {
            dst7[0] = src0[0];
            dst7[1] = src0[1];
            dst7[2] = src0[2];
            dst7[3] = src0[3];
            dst7[4] = src1[0];
            dst7[5] = src1[1];
            dst7[6] = src1[2];
            dst7[7] = src1[3];
            dst7[8] = src0[0 + src_step];
            dst7[9] = src0[1 + src_step];
            dst7[10] = src0[2 + src_step];
            dst7[11] = src0[3 + src_step];
            dst7[12] = src1[0 + src_step];
            dst7[13] = src1[1 + src_step];
            dst7[14] = src1[2 + src_step];
            dst7[15] = src1[3 + src_step];
            dst7[16] = src0[0 + 2 * src_step];
            dst7[17] = src0[1 + 2 * src_step];
            dst7[18] = src0[2 + 2 * src_step];
            dst7[19] = src0[3 + 2 * src_step];
            dst7[20] = src1[0 + 2 * src_step];
            dst7[21] = src1[1 + 2 * src_step];
            dst7[22] = src1[2 + 2 * src_step];
            dst7[23] = src1[3 + 2 * src_step];
            dst7[24] = src0[0 + 3 * src_step];
            dst7[25] = src0[1 + 3 * src_step];
            dst7[26] = src0[2 + 3 * src_step];
            dst7[27] = src0[3 + 3 * src_step];
            dst7[28] = src1[0 + 3 * src_step];
            dst7[29] = src1[1 + 3 * src_step];
            dst7[30] = src1[2 + 3 * src_step];
            dst7[31] = src1[3 + 3 * src_step];

            src0 += 4;
            src1 += 4;

            dst7 -= stride;
        }

        src0 += srcwgap + 7 * srcstride;
    }
#endif // __ARM_NEON
    for (; y < srch; y++)
    {
        unsigned char* dst0 = dstend + y * 4;

        int x = 0;
        for (; x < srcw; x++)
        {
            dst0[0] = src0[0];
            dst0[1] = src0[1];
            dst0[2] = src0[2];
            dst0[3] = src0[3];

            src0 += 4;
            dst0 -= stride;
        }

        src0 += srcwgap;
    }
}

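// the convenience overloads below assume tightly packed rows, i.e. stride == width * channels
//
// illustrative usage (buffer sizes are the caller's responsibility):
//
//     // rotate a packed w x h BGR image by 180 degrees (type 3)
//     // std::vector<unsigned char> out(w * h * 3);
//     // ncnn::kanna_rotate_c3(bgr, w, h, out.data(), w, h, 3);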
void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type)
{
    return kanna_rotate_c1(src, srcw, srch, srcw, dst, w, h, w, type);
}

void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type)
{
    return kanna_rotate_c2(src, srcw, srch, srcw * 2, dst, w, h, w * 2, type);
}

void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type)
{
    return kanna_rotate_c3(src, srcw, srch, srcw * 3, dst, w, h, w * 3, type);
}

void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type)
{
    return kanna_rotate_c4(src, srcw, srch, srcw * 4, dst, w, h, w * 4, type);
}

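// the rotate type appears to follow the EXIF Orientation convention:
// 1 = no change, 2 = horizontal flip, 3 = rotate 180, 4 = vertical flip,
// 5 = transpose, 6 = rotate 90 clockwise, 7 = transverse, 8 = rotate 90 counter-clockwise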
void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type)
{
    // assert srcw == w && srch == h for type 1234
    // assert srcw == h && srch == w for type 5678

    switch (type)
    {
    case 1:
        kanna_rotate_1_c1(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 2:
        kanna_rotate_2_c1(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 3:
        kanna_rotate_3_c1(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 4:
        kanna_rotate_4_c1(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 5:
        kanna_rotate_5_c1(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 6:
        kanna_rotate_6_c1(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 7:
        kanna_rotate_7_c1(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 8:
        kanna_rotate_8_c1(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    default:
        // unsupported rotate type
        break;
    }
}

void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type)
{
    // assert srcw == w && srch == h for type 1234
    // assert srcw == h && srch == w for type 5678

    switch (type)
    {
    case 1:
        kanna_rotate_1_c2(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 2:
        kanna_rotate_2_c2(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 3:
        kanna_rotate_3_c2(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 4:
        kanna_rotate_4_c2(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 5:
        kanna_rotate_5_c2(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 6:
        kanna_rotate_6_c2(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 7:
        kanna_rotate_7_c2(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 8:
        kanna_rotate_8_c2(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    default:
        // unsupported rotate type
        break;
    }
}

void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type)
{
    // assert srcw == w && srch == h for type 1234
    // assert srcw == h && srch == w for type 5678

    switch (type)
    {
    case 1:
        kanna_rotate_1_c3(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 2:
        kanna_rotate_2_c3(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 3:
        kanna_rotate_3_c3(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 4:
        kanna_rotate_4_c3(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 5:
        kanna_rotate_5_c3(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 6:
        kanna_rotate_6_c3(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 7:
        kanna_rotate_7_c3(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 8:
        kanna_rotate_8_c3(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    default:
        // unsupported rotate type
        break;
    }
}

void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type)
{
    // assert srcw == w && srch == h for type 1234
    // assert srcw == h && srch == w for type 5678

    switch (type)
    {
    case 1:
        kanna_rotate_1_c4(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 2:
        kanna_rotate_2_c4(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 3:
        kanna_rotate_3_c4(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 4:
        kanna_rotate_4_c4(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 5:
        kanna_rotate_5_c4(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 6:
        kanna_rotate_6_c4(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 7:
        kanna_rotate_7_c4(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    case 8:
        kanna_rotate_8_c4(src, srcw, srch, srcstride, dst, w, h, stride);
        break;
    default:
        // unsupported rotate type
        break;
    }
}

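// rotate a YUV420sp (NV21/NV12) frame: the Y plane is rotated as 1-channel data,
// the interleaved UV plane as 2-channel data at half resolution
//
// illustrative usage (buffer sizes assumed by the caller; types 5-8 swap width and height):
//
//     // std::vector<unsigned char> rotated(w * h * 3 / 2);
//     // ncnn::kanna_rotate_yuv420sp(nv21, w, h, rotated.data(), h, w, 6);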
void kanna_rotate_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type)
{
    // assert srcw % 2 == 0
    // assert srch % 2 == 0
    // assert w % 2 == 0
    // assert h % 2 == 0

    const unsigned char* srcY = src;
    unsigned char* dstY = dst;
    kanna_rotate_c1(srcY, srcw, srch, dstY, w, h, type);

    const unsigned char* srcUV = src + srcw * srch;
    unsigned char* dstUV = dst + w * h;
    kanna_rotate_c2(srcUV, srcw / 2, srch / 2, dstUV, w / 2, h / 2, type);
}
#endif // NCNN_PIXEL_ROTATE

} // namespace ncnn