Update
This commit is contained in:
435
3rdparty/libyuv/source/compare.cc
vendored
Normal file
435
3rdparty/libyuv/source/compare.cc
vendored
Normal file
@@ -0,0 +1,435 @@
|
||||
/*
|
||||
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/compare.h"
|
||||
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
#ifdef _OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
#include "libyuv/compare_row.h"
|
||||
#include "libyuv/cpu_id.h"
|
||||
#include "libyuv/row.h"
|
||||
#include "libyuv/video_common.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// hash seed of 5381 recommended.
|
||||
LIBYUV_API
|
||||
uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) {
|
||||
const int kBlockSize = 1 << 15; // 32768;
|
||||
int remainder;
|
||||
uint32_t (*HashDjb2_SSE)(const uint8_t* src, int count, uint32_t seed) =
|
||||
HashDjb2_C;
|
||||
#if defined(HAS_HASHDJB2_SSE41)
|
||||
if (TestCpuFlag(kCpuHasSSE41)) {
|
||||
HashDjb2_SSE = HashDjb2_SSE41;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_HASHDJB2_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
HashDjb2_SSE = HashDjb2_AVX2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_HASHDJB2_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
HashDjb2_SSE = HashDjb2_NEON;
|
||||
}
|
||||
#endif
|
||||
|
||||
while (count >= (uint64_t)kBlockSize) {
|
||||
seed = HashDjb2_SSE(src, kBlockSize, seed);
|
||||
src += kBlockSize;
|
||||
count -= kBlockSize;
|
||||
}
|
||||
remainder = (int)count & ~15;
|
||||
if (remainder) {
|
||||
seed = HashDjb2_SSE(src, remainder, seed);
|
||||
src += remainder;
|
||||
count -= remainder;
|
||||
}
|
||||
remainder = (int)count & 15;
|
||||
if (remainder) {
|
||||
seed = HashDjb2_C(src, remainder, seed);
|
||||
}
|
||||
return seed;
|
||||
}
|
||||
|
||||
static uint32_t ARGBDetectRow_C(const uint8_t* argb, int width) {
|
||||
int x;
|
||||
for (x = 0; x < width - 1; x += 2) {
|
||||
if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
|
||||
return FOURCC_BGRA;
|
||||
}
|
||||
if (argb[3] != 255) { // Fourth byte is not Alpha of 255, so not BGRA.
|
||||
return FOURCC_ARGB;
|
||||
}
|
||||
if (argb[4] != 255) { // Second pixel first byte is not Alpha of 255.
|
||||
return FOURCC_BGRA;
|
||||
}
|
||||
if (argb[7] != 255) { // Second pixel fourth byte is not Alpha of 255.
|
||||
return FOURCC_ARGB;
|
||||
}
|
||||
argb += 8;
|
||||
}
|
||||
if (width & 1) {
|
||||
if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
|
||||
return FOURCC_BGRA;
|
||||
}
|
||||
if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA.
|
||||
return FOURCC_ARGB;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Scan an opaque argb image and return fourcc based on alpha offset.
|
||||
// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
|
||||
LIBYUV_API
|
||||
uint32_t ARGBDetect(const uint8_t* argb,
|
||||
int stride_argb,
|
||||
int width,
|
||||
int height) {
|
||||
uint32_t fourcc = 0;
|
||||
int h;
|
||||
|
||||
// Coalesce rows.
|
||||
if (stride_argb == width * 4) {
|
||||
width *= height;
|
||||
height = 1;
|
||||
stride_argb = 0;
|
||||
}
|
||||
for (h = 0; h < height && fourcc == 0; ++h) {
|
||||
fourcc = ARGBDetectRow_C(argb, width);
|
||||
argb += stride_argb;
|
||||
}
|
||||
return fourcc;
|
||||
}
|
||||
|
||||
// NEON version accumulates in 16 bit shorts which overflow at 65536 bytes.
|
||||
// So actual maximum is 1 less loop, which is 65536 - 32 bytes.
|
||||
|
||||
LIBYUV_API
|
||||
uint64_t ComputeHammingDistance(const uint8_t* src_a,
|
||||
const uint8_t* src_b,
|
||||
int count) {
|
||||
const int kBlockSize = 1 << 15; // 32768;
|
||||
const int kSimdSize = 64;
|
||||
// SIMD for multiple of 64, and C for remainder
|
||||
int remainder = count & (kBlockSize - 1) & ~(kSimdSize - 1);
|
||||
uint64_t diff = 0;
|
||||
int i;
|
||||
uint32_t (*HammingDistance)(const uint8_t* src_a, const uint8_t* src_b,
|
||||
int count) = HammingDistance_C;
|
||||
#if defined(HAS_HAMMINGDISTANCE_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
HammingDistance = HammingDistance_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_HAMMINGDISTANCE_NEON_DOTPROD)
|
||||
if (TestCpuFlag(kCpuHasNeonDotProd)) {
|
||||
HammingDistance = HammingDistance_NEON_DotProd;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_HAMMINGDISTANCE_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
HammingDistance = HammingDistance_SSSE3;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_HAMMINGDISTANCE_SSE42)
|
||||
if (TestCpuFlag(kCpuHasSSE42)) {
|
||||
HammingDistance = HammingDistance_SSE42;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_HAMMINGDISTANCE_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
HammingDistance = HammingDistance_AVX2;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef _OPENMP
|
||||
#pragma omp parallel for reduction(+ : diff)
|
||||
#endif
|
||||
for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
|
||||
diff += HammingDistance(src_a + i, src_b + i, kBlockSize);
|
||||
}
|
||||
src_a += count & ~(kBlockSize - 1);
|
||||
src_b += count & ~(kBlockSize - 1);
|
||||
if (remainder) {
|
||||
diff += HammingDistance(src_a, src_b, remainder);
|
||||
src_a += remainder;
|
||||
src_b += remainder;
|
||||
}
|
||||
remainder = count & (kSimdSize - 1);
|
||||
if (remainder) {
|
||||
diff += HammingDistance_C(src_a, src_b, remainder);
|
||||
}
|
||||
return diff;
|
||||
}
|
||||
|
||||
// TODO(fbarchard): Refactor into row function.
|
||||
LIBYUV_API
|
||||
uint64_t ComputeSumSquareError(const uint8_t* src_a,
|
||||
const uint8_t* src_b,
|
||||
int count) {
|
||||
// SumSquareError returns values 0 to 65535 for each squared difference.
|
||||
// Up to 65536 of those can be summed and remain within a uint32_t.
|
||||
// After each block of 65536 pixels, accumulate into a uint64_t.
|
||||
const int kBlockSize = 65536;
|
||||
int remainder = count & (kBlockSize - 1) & ~31;
|
||||
uint64_t sse = 0;
|
||||
int i;
|
||||
uint32_t (*SumSquareError)(const uint8_t* src_a, const uint8_t* src_b,
|
||||
int count) = SumSquareError_C;
|
||||
#if defined(HAS_SUMSQUAREERROR_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
SumSquareError = SumSquareError_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SUMSQUAREERROR_NEON_DOTPROD)
|
||||
if (TestCpuFlag(kCpuHasNeonDotProd)) {
|
||||
SumSquareError = SumSquareError_NEON_DotProd;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SUMSQUAREERROR_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
// Note only used for multiples of 16 so count is not checked.
|
||||
SumSquareError = SumSquareError_SSE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SUMSQUAREERROR_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
// Note only used for multiples of 32 so count is not checked.
|
||||
SumSquareError = SumSquareError_AVX2;
|
||||
}
|
||||
#endif
|
||||
#ifdef _OPENMP
|
||||
#pragma omp parallel for reduction(+ : sse)
|
||||
#endif
|
||||
for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
|
||||
sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
|
||||
}
|
||||
src_a += count & ~(kBlockSize - 1);
|
||||
src_b += count & ~(kBlockSize - 1);
|
||||
if (remainder) {
|
||||
sse += SumSquareError(src_a, src_b, remainder);
|
||||
src_a += remainder;
|
||||
src_b += remainder;
|
||||
}
|
||||
remainder = count & 31;
|
||||
if (remainder) {
|
||||
sse += SumSquareError_C(src_a, src_b, remainder);
|
||||
}
|
||||
return sse;
|
||||
}
|
||||
|
||||
LIBYUV_API
|
||||
uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a,
|
||||
int stride_a,
|
||||
const uint8_t* src_b,
|
||||
int stride_b,
|
||||
int width,
|
||||
int height) {
|
||||
uint64_t sse = 0;
|
||||
int h;
|
||||
// Coalesce rows.
|
||||
if (stride_a == width && stride_b == width) {
|
||||
width *= height;
|
||||
height = 1;
|
||||
stride_a = stride_b = 0;
|
||||
}
|
||||
for (h = 0; h < height; ++h) {
|
||||
sse += ComputeSumSquareError(src_a, src_b, width);
|
||||
src_a += stride_a;
|
||||
src_b += stride_b;
|
||||
}
|
||||
return sse;
|
||||
}
|
||||
|
||||
LIBYUV_API
|
||||
double SumSquareErrorToPsnr(uint64_t sse, uint64_t count) {
|
||||
double psnr;
|
||||
if (sse > 0) {
|
||||
double mse = (double)count / (double)sse;
|
||||
psnr = 10.0 * log10(255.0 * 255.0 * mse);
|
||||
} else {
|
||||
psnr = kMaxPsnr; // Limit to prevent divide by 0
|
||||
}
|
||||
|
||||
if (psnr > kMaxPsnr) {
|
||||
psnr = kMaxPsnr;
|
||||
}
|
||||
|
||||
return psnr;
|
||||
}
|
||||
|
||||
LIBYUV_API
|
||||
double CalcFramePsnr(const uint8_t* src_a,
|
||||
int stride_a,
|
||||
const uint8_t* src_b,
|
||||
int stride_b,
|
||||
int width,
|
||||
int height) {
|
||||
const uint64_t samples = (uint64_t)width * (uint64_t)height;
|
||||
const uint64_t sse = ComputeSumSquareErrorPlane(src_a, stride_a, src_b,
|
||||
stride_b, width, height);
|
||||
return SumSquareErrorToPsnr(sse, samples);
|
||||
}
|
||||
|
||||
LIBYUV_API
|
||||
double I420Psnr(const uint8_t* src_y_a,
|
||||
int stride_y_a,
|
||||
const uint8_t* src_u_a,
|
||||
int stride_u_a,
|
||||
const uint8_t* src_v_a,
|
||||
int stride_v_a,
|
||||
const uint8_t* src_y_b,
|
||||
int stride_y_b,
|
||||
const uint8_t* src_u_b,
|
||||
int stride_u_b,
|
||||
const uint8_t* src_v_b,
|
||||
int stride_v_b,
|
||||
int width,
|
||||
int height) {
|
||||
const uint64_t sse_y = ComputeSumSquareErrorPlane(
|
||||
src_y_a, stride_y_a, src_y_b, stride_y_b, width, height);
|
||||
const int width_uv = (width + 1) >> 1;
|
||||
const int height_uv = (height + 1) >> 1;
|
||||
const uint64_t sse_u = ComputeSumSquareErrorPlane(
|
||||
src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv);
|
||||
const uint64_t sse_v = ComputeSumSquareErrorPlane(
|
||||
src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv);
|
||||
const uint64_t samples = (uint64_t)width * (uint64_t)height +
|
||||
2 * ((uint64_t)width_uv * (uint64_t)height_uv);
|
||||
const uint64_t sse = sse_y + sse_u + sse_v;
|
||||
return SumSquareErrorToPsnr(sse, samples);
|
||||
}
|
||||
|
||||
// SSIM stabilising constants, pre-scaled by 64^2.
static const int64_t cc1 = 26634;   // 64^2 * (.01 * 255)^2
static const int64_t cc2 = 239708;  // 64^2 * (.03 * 255)^2

// Computes the SSIM of one 8x8 window using integer sums.
// |src_a| / |src_b| point at the top-left byte of each window and
// |stride_a| / |stride_b| are the byte distances between rows.
// Returns numerator / denominator as a double, or DBL_MAX if the denominator
// is zero.
static double Ssim8x8_C(const uint8_t* src_a,
                        int stride_a,
                        const uint8_t* src_b,
                        int stride_b) {
  const int64_t kPixels = 64;
  // scale the constants by number of pixels
  const int64_t c1 = (cc1 * kPixels * kPixels) >> 12;
  const int64_t c2 = (cc2 * kPixels * kPixels) >> 12;

  int64_t sum_a = 0;
  int64_t sum_b = 0;
  int64_t sum_sq_a = 0;
  int64_t sum_sq_b = 0;
  int64_t sum_axb = 0;
  int64_t sum_a_x_sum_b;
  int64_t sum_a_sq;
  int64_t sum_b_sq;
  int64_t numerator;
  int64_t denominator;
  int rows_left;

  // Gather first- and second-order sums over the 64 pixels.
  for (rows_left = 8; rows_left > 0; --rows_left) {
    int col;
    for (col = 0; col < 8; ++col) {
      const int a = src_a[col];
      const int b = src_b[col];
      sum_a += a;
      sum_b += b;
      sum_sq_a += a * a;
      sum_sq_b += b * b;
      sum_axb += a * b;
    }
    src_a += stride_a;
    src_b += stride_b;
  }

  sum_a_x_sum_b = sum_a * sum_b;
  sum_a_sq = sum_a * sum_a;
  sum_b_sq = sum_b * sum_b;

  numerator = (2 * sum_a_x_sum_b + c1) *
              (2 * kPixels * sum_axb - 2 * sum_a_x_sum_b + c2);
  denominator =
      (sum_a_sq + sum_b_sq + c1) *
      (kPixels * sum_sq_a - sum_a_sq + kPixels * sum_sq_b - sum_b_sq + c2);

  if (denominator == 0) {
    return DBL_MAX;
  }
  return (double)numerator / (double)denominator;
}
|
||||
|
||||
// We are using a 8x8 moving window with starting location of each 8x8 window
|
||||
// on the 4x4 pixel grid. Such arrangement allows the windows to overlap
|
||||
// block boundaries to penalize blocking artifacts.
|
||||
LIBYUV_API
|
||||
double CalcFrameSsim(const uint8_t* src_a,
|
||||
int stride_a,
|
||||
const uint8_t* src_b,
|
||||
int stride_b,
|
||||
int width,
|
||||
int height) {
|
||||
int samples = 0;
|
||||
double ssim_total = 0;
|
||||
double (*Ssim8x8)(const uint8_t* src_a, int stride_a, const uint8_t* src_b,
|
||||
int stride_b) = Ssim8x8_C;
|
||||
|
||||
// sample point start with each 4x4 location
|
||||
int i;
|
||||
for (i = 0; i < height - 8; i += 4) {
|
||||
int j;
|
||||
for (j = 0; j < width - 8; j += 4) {
|
||||
ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b);
|
||||
samples++;
|
||||
}
|
||||
|
||||
src_a += stride_a * 4;
|
||||
src_b += stride_b * 4;
|
||||
}
|
||||
|
||||
ssim_total /= samples;
|
||||
return ssim_total;
|
||||
}
|
||||
|
||||
LIBYUV_API
|
||||
double I420Ssim(const uint8_t* src_y_a,
|
||||
int stride_y_a,
|
||||
const uint8_t* src_u_a,
|
||||
int stride_u_a,
|
||||
const uint8_t* src_v_a,
|
||||
int stride_v_a,
|
||||
const uint8_t* src_y_b,
|
||||
int stride_y_b,
|
||||
const uint8_t* src_u_b,
|
||||
int stride_u_b,
|
||||
const uint8_t* src_v_b,
|
||||
int stride_v_b,
|
||||
int width,
|
||||
int height) {
|
||||
const double ssim_y =
|
||||
CalcFrameSsim(src_y_a, stride_y_a, src_y_b, stride_y_b, width, height);
|
||||
const int width_uv = (width + 1) >> 1;
|
||||
const int height_uv = (height + 1) >> 1;
|
||||
const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, src_u_b, stride_u_b,
|
||||
width_uv, height_uv);
|
||||
const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, src_v_b, stride_v_b,
|
||||
width_uv, height_uv);
|
||||
return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v);
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
74
3rdparty/libyuv/source/compare_common.cc
vendored
Normal file
74
3rdparty/libyuv/source/compare_common.cc
vendored
Normal file
@@ -0,0 +1,74 @@
|
||||
/*
|
||||
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
|
||||
#include "libyuv/compare_row.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Hakmem method for hamming distance.
// Returns the number of bit positions that differ between the |count| bytes
// at |src_a| and |src_b|. Four bytes are handled per step while at least four
// remain; the last 0..3 bytes are handled one at a time.
uint32_t HammingDistance_C(const uint8_t* src_a,
                           const uint8_t* src_b,
                           int count) {
  uint32_t diff = 0u;

  int i;
  for (i = 0; i < count - 3; i += 4) {
    // XOR byte by byte and pack the result into one word. Reading the buffers
    // through a uint32_t pointer would be a misaligned, aliasing-violating
    // access; the packing order is irrelevant because only set bits are
    // counted.
    const uint32_t x = (uint32_t)(src_a[0] ^ src_b[0]) |
                       ((uint32_t)(src_a[1] ^ src_b[1]) << 8) |
                       ((uint32_t)(src_a[2] ^ src_b[2]) << 16) |
                       ((uint32_t)(src_a[3] ^ src_b[3]) << 24);
    // Parallel bit count: 2-bit sums, 4-bit sums, then per-byte sums that the
    // multiply folds into the top byte.
    uint32_t u = x - ((x >> 1) & 0x55555555);
    u = ((u >> 2) & 0x33333333) + (u & 0x33333333);
    diff += ((((u + (u >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24);
    src_a += 4;
    src_b += 4;
  }

  // Remaining bytes: the same bit count restricted to 8 bits.
  for (; i < count; ++i) {
    uint32_t x = *src_a ^ *src_b;
    uint32_t u = x - ((x >> 1) & 0x55);
    u = ((u >> 2) & 0x33) + (u & 0x33);
    diff += (u + (u >> 4)) & 0x0f;
    src_a += 1;
    src_b += 1;
  }

  return diff;
}
|
||||
|
||||
// Returns the sum over |count| bytes of (src_a[i] - src_b[i])^2, accumulated
// in 32 bits.
uint32_t SumSquareError_C(const uint8_t* src_a,
                          const uint8_t* src_b,
                          int count) {
  const uint8_t* pa = src_a;
  const uint8_t* pb = src_b;
  uint32_t total = 0u;
  int remaining;
  for (remaining = count; remaining > 0; --remaining) {
    // Bytes are promoted to int, so the difference may be negative.
    const int delta = (int)*pa - (int)*pb;
    total += (uint32_t)(delta * delta);
    ++pa;
    ++pb;
  }
  return total;
}
|
||||
|
||||
// hash seed of 5381 recommended.
// Internal C version of HashDjb2 with int sized count for efficiency.
// For every byte: hash = hash * 33 + byte, in 32-bit wrapping arithmetic.
uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed) {
  const uint8_t* p = src;
  uint32_t hash = seed;
  int remaining;
  for (remaining = count; remaining > 0; --remaining) {
    hash = hash * 33u + *p;
    ++p;
  }
  return hash;
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
362
3rdparty/libyuv/source/compare_gcc.cc
vendored
Normal file
362
3rdparty/libyuv/source/compare_gcc.cc
vendored
Normal file
@@ -0,0 +1,362 @@
|
||||
/*
|
||||
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
|
||||
#include "libyuv/compare_row.h"
|
||||
#include "libyuv/row.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// This module is for GCC x86 and x64.
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(__x86_64__) || defined(__i386__)) && \
|
||||
!defined(LIBYUV_ENABLE_ROWWIN)
|
||||
|
||||
// "memory" clobber prevents the reads from being removed
|
||||
|
||||
#if defined(__x86_64__)
|
||||
uint32_t HammingDistance_SSE42(const uint8_t* src_a,
|
||||
const uint8_t* src_b,
|
||||
int count) {
|
||||
uint64_t diff;
|
||||
|
||||
asm volatile(
|
||||
"xor %3,%3 \n"
|
||||
"xor %%r8,%%r8 \n"
|
||||
"xor %%r9,%%r9 \n"
|
||||
"xor %%r10,%%r10 \n"
|
||||
|
||||
// Process 32 bytes per loop.
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"mov (%0),%%rcx \n"
|
||||
"mov 0x8(%0),%%rdx \n"
|
||||
"xor (%1),%%rcx \n"
|
||||
"xor 0x8(%1),%%rdx \n"
|
||||
"popcnt %%rcx,%%rcx \n"
|
||||
"popcnt %%rdx,%%rdx \n"
|
||||
"mov 0x10(%0),%%rsi \n"
|
||||
"mov 0x18(%0),%%rdi \n"
|
||||
"xor 0x10(%1),%%rsi \n"
|
||||
"xor 0x18(%1),%%rdi \n"
|
||||
"popcnt %%rsi,%%rsi \n"
|
||||
"popcnt %%rdi,%%rdi \n"
|
||||
"add $0x20,%0 \n"
|
||||
"add $0x20,%1 \n"
|
||||
"add %%rcx,%3 \n"
|
||||
"add %%rdx,%%r8 \n"
|
||||
"add %%rsi,%%r9 \n"
|
||||
"add %%rdi,%%r10 \n"
|
||||
"sub $0x20,%2 \n"
|
||||
"jg 1b \n"
|
||||
|
||||
"add %%r8, %3 \n"
|
||||
"add %%r9, %3 \n"
|
||||
"add %%r10, %3 \n"
|
||||
: "+r"(src_a), // %0
|
||||
"+r"(src_b), // %1
|
||||
"+r"(count), // %2
|
||||
"=&r"(diff) // %3
|
||||
:
|
||||
: "cc", "memory", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10");
|
||||
|
||||
return (uint32_t)(diff);
|
||||
}
|
||||
#else
|
||||
uint32_t HammingDistance_SSE42(const uint8_t* src_a,
|
||||
const uint8_t* src_b,
|
||||
int count) {
|
||||
uint32_t diff = 0u;
|
||||
|
||||
asm volatile(
|
||||
// Process 16 bytes per loop.
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"mov (%0),%%ecx \n"
|
||||
"mov 0x4(%0),%%edx \n"
|
||||
"xor (%1),%%ecx \n"
|
||||
"xor 0x4(%1),%%edx \n"
|
||||
"popcnt %%ecx,%%ecx \n"
|
||||
"add %%ecx,%3 \n"
|
||||
"popcnt %%edx,%%edx \n"
|
||||
"add %%edx,%3 \n"
|
||||
"mov 0x8(%0),%%ecx \n"
|
||||
"mov 0xc(%0),%%edx \n"
|
||||
"xor 0x8(%1),%%ecx \n"
|
||||
"xor 0xc(%1),%%edx \n"
|
||||
"popcnt %%ecx,%%ecx \n"
|
||||
"add %%ecx,%3 \n"
|
||||
"popcnt %%edx,%%edx \n"
|
||||
"add %%edx,%3 \n"
|
||||
"add $0x10,%0 \n"
|
||||
"add $0x10,%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_a), // %0
|
||||
"+r"(src_b), // %1
|
||||
"+r"(count), // %2
|
||||
"+r"(diff) // %3
|
||||
:
|
||||
: "cc", "memory", "ecx", "edx");
|
||||
|
||||
return diff;
|
||||
}
|
||||
#endif
|
||||
|
||||
// 0x0f in each of the 16 lanes: isolates the low nibble of every byte.
static const vec8 kNibbleMask = {0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
                                 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
                                 0x0f, 0x0f, 0x0f, 0x0f};

// kBitCount[n] is the number of set bits in the 4-bit value n; used as a
// byte-shuffle lookup table by the SSSE3 / AVX2 Hamming distance kernels.
static const vec8 kBitCount = {
    0, 1, 1, 2,  // 0x0 .. 0x3
    1, 2, 2, 3,  // 0x4 .. 0x7
    1, 2, 2, 3,  // 0x8 .. 0xb
    2, 3, 3, 4   // 0xc .. 0xf
};
|
||||
|
||||
uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
|
||||
const uint8_t* src_b,
|
||||
int count) {
|
||||
uint32_t diff;
|
||||
|
||||
asm volatile(
|
||||
"movdqa %4,%%xmm2 \n"
|
||||
"movdqa %5,%%xmm3 \n"
|
||||
"pxor %%xmm0,%%xmm0 \n"
|
||||
"pxor %%xmm1,%%xmm1 \n"
|
||||
"sub %0,%1 \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movdqa (%0),%%xmm4 \n"
|
||||
"movdqa 0x10(%0), %%xmm5 \n"
|
||||
"pxor (%0,%1), %%xmm4 \n"
|
||||
"movdqa %%xmm4,%%xmm6 \n"
|
||||
"pand %%xmm2,%%xmm6 \n"
|
||||
"psrlw $0x4,%%xmm4 \n"
|
||||
"movdqa %%xmm3,%%xmm7 \n"
|
||||
"pshufb %%xmm6,%%xmm7 \n"
|
||||
"pand %%xmm2,%%xmm4 \n"
|
||||
"movdqa %%xmm3,%%xmm6 \n"
|
||||
"pshufb %%xmm4,%%xmm6 \n"
|
||||
"paddb %%xmm7,%%xmm6 \n"
|
||||
"pxor 0x10(%0,%1),%%xmm5 \n"
|
||||
"add $0x20,%0 \n"
|
||||
"movdqa %%xmm5,%%xmm4 \n"
|
||||
"pand %%xmm2,%%xmm5 \n"
|
||||
"psrlw $0x4,%%xmm4 \n"
|
||||
"movdqa %%xmm3,%%xmm7 \n"
|
||||
"pshufb %%xmm5,%%xmm7 \n"
|
||||
"pand %%xmm2,%%xmm4 \n"
|
||||
"movdqa %%xmm3,%%xmm5 \n"
|
||||
"pshufb %%xmm4,%%xmm5 \n"
|
||||
"paddb %%xmm7,%%xmm5 \n"
|
||||
"paddb %%xmm5,%%xmm6 \n"
|
||||
"psadbw %%xmm1,%%xmm6 \n"
|
||||
"paddd %%xmm6,%%xmm0 \n"
|
||||
"sub $0x20,%2 \n"
|
||||
"jg 1b \n"
|
||||
|
||||
"pshufd $0xaa,%%xmm0,%%xmm1 \n"
|
||||
"paddd %%xmm1,%%xmm0 \n"
|
||||
"movd %%xmm0, %3 \n"
|
||||
: "+r"(src_a), // %0
|
||||
"+r"(src_b), // %1
|
||||
"+r"(count), // %2
|
||||
"=r"(diff) // %3
|
||||
: "m"(kNibbleMask), // %4
|
||||
"m"(kBitCount) // %5
|
||||
: "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
|
||||
return diff;
|
||||
}
|
||||
|
||||
#ifdef HAS_HAMMINGDISTANCE_AVX2
|
||||
uint32_t HammingDistance_AVX2(const uint8_t* src_a,
|
||||
const uint8_t* src_b,
|
||||
int count) {
|
||||
uint32_t diff;
|
||||
|
||||
asm volatile(
|
||||
"vbroadcastf128 %4,%%ymm2 \n"
|
||||
"vbroadcastf128 %5,%%ymm3 \n"
|
||||
"vpxor %%ymm0,%%ymm0,%%ymm0 \n"
|
||||
"vpxor %%ymm1,%%ymm1,%%ymm1 \n"
|
||||
"sub %0,%1 \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"vmovdqa (%0),%%ymm4 \n"
|
||||
"vmovdqa 0x20(%0), %%ymm5 \n"
|
||||
"vpxor (%0,%1), %%ymm4, %%ymm4 \n"
|
||||
"vpand %%ymm2,%%ymm4,%%ymm6 \n"
|
||||
"vpsrlw $0x4,%%ymm4,%%ymm4 \n"
|
||||
"vpshufb %%ymm6,%%ymm3,%%ymm6 \n"
|
||||
"vpand %%ymm2,%%ymm4,%%ymm4 \n"
|
||||
"vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
|
||||
"vpaddb %%ymm4,%%ymm6,%%ymm6 \n"
|
||||
"vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n"
|
||||
"add $0x40,%0 \n"
|
||||
"vpand %%ymm2,%%ymm4,%%ymm5 \n"
|
||||
"vpsrlw $0x4,%%ymm4,%%ymm4 \n"
|
||||
"vpshufb %%ymm5,%%ymm3,%%ymm5 \n"
|
||||
"vpand %%ymm2,%%ymm4,%%ymm4 \n"
|
||||
"vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
|
||||
"vpaddb %%ymm5,%%ymm4,%%ymm4 \n"
|
||||
"vpaddb %%ymm6,%%ymm4,%%ymm4 \n"
|
||||
"vpsadbw %%ymm1,%%ymm4,%%ymm4 \n"
|
||||
"vpaddd %%ymm0,%%ymm4,%%ymm0 \n"
|
||||
"sub $0x40,%2 \n"
|
||||
"jg 1b \n"
|
||||
|
||||
"vpermq $0xb1,%%ymm0,%%ymm1 \n"
|
||||
"vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
|
||||
"vpermq $0xaa,%%ymm0,%%ymm1 \n"
|
||||
"vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
|
||||
"vmovd %%xmm0,%3 \n"
|
||||
"vzeroupper \n"
|
||||
: "+r"(src_a), // %0
|
||||
"+r"(src_b), // %1
|
||||
"+r"(count), // %2
|
||||
"=r"(diff) // %3
|
||||
: "m"(kNibbleMask), // %4
|
||||
"m"(kBitCount) // %5
|
||||
: "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
|
||||
|
||||
return diff;
|
||||
}
|
||||
#endif // HAS_HAMMINGDISTANCE_AVX2
|
||||
|
||||
uint32_t SumSquareError_SSE2(const uint8_t* src_a,
|
||||
const uint8_t* src_b,
|
||||
int count) {
|
||||
uint32_t sse;
|
||||
asm volatile(
|
||||
"pxor %%xmm0,%%xmm0 \n"
|
||||
"pxor %%xmm5,%%xmm5 \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm1 \n"
|
||||
"lea 0x10(%0),%0 \n"
|
||||
"movdqu (%1),%%xmm2 \n"
|
||||
"lea 0x10(%1),%1 \n"
|
||||
"movdqa %%xmm1,%%xmm3 \n"
|
||||
"psubusb %%xmm2,%%xmm1 \n"
|
||||
"psubusb %%xmm3,%%xmm2 \n"
|
||||
"por %%xmm2,%%xmm1 \n"
|
||||
"movdqa %%xmm1,%%xmm2 \n"
|
||||
"punpcklbw %%xmm5,%%xmm1 \n"
|
||||
"punpckhbw %%xmm5,%%xmm2 \n"
|
||||
"pmaddwd %%xmm1,%%xmm1 \n"
|
||||
"pmaddwd %%xmm2,%%xmm2 \n"
|
||||
"paddd %%xmm1,%%xmm0 \n"
|
||||
"paddd %%xmm2,%%xmm0 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
|
||||
"pshufd $0xee,%%xmm0,%%xmm1 \n"
|
||||
"paddd %%xmm1,%%xmm0 \n"
|
||||
"pshufd $0x1,%%xmm0,%%xmm1 \n"
|
||||
"paddd %%xmm1,%%xmm0 \n"
|
||||
"movd %%xmm0,%3 \n"
|
||||
: "+r"(src_a), // %0
|
||||
"+r"(src_b), // %1
|
||||
"+r"(count), // %2
|
||||
"=r"(sse) // %3
|
||||
:
|
||||
: "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
|
||||
return sse;
|
||||
}
|
||||
|
||||
// Multipliers for hashing 16 bytes per step with djb2 (hash = hash * 33 +
// byte): the running hash is scaled by 33^16 and byte k of the step by
// 33^(15 - k). Values are the low 32 bits of each power.
static const uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0};  // 33 ^ 16

// Bytes 0..3 of the step.
static const uvec32 kHashMul0 = {
    0x0c3525e1,  // 33 ^ 15
    0xa3476dc1,  // 33 ^ 14
    0x3b4039a1,  // 33 ^ 13
    0x4f5f0981,  // 33 ^ 12
};

// Bytes 4..7 of the step.
static const uvec32 kHashMul1 = {
    0x30f35d61,  // 33 ^ 11
    0x855cb541,  // 33 ^ 10
    0x040a9121,  // 33 ^ 9
    0x747c7101,  // 33 ^ 8
};

// Bytes 8..11 of the step.
static const uvec32 kHashMul2 = {
    0xec41d4e1,  // 33 ^ 7
    0x4cfa3cc1,  // 33 ^ 6
    0x025528a1,  // 33 ^ 5
    0x00121881,  // 33 ^ 4
};

// Bytes 12..15 of the step.
static const uvec32 kHashMul3 = {
    0x00008c61,  // 33 ^ 3
    0x00000441,  // 33 ^ 2
    0x00000021,  // 33 ^ 1
    0x00000001,  // 33 ^ 0
};
|
||||
|
||||
uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
|
||||
uint32_t hash;
|
||||
asm volatile(
|
||||
"movd %2,%%xmm0 \n"
|
||||
"pxor %%xmm7,%%xmm7 \n"
|
||||
"movdqa %4,%%xmm6 \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm1 \n"
|
||||
"lea 0x10(%0),%0 \n"
|
||||
"pmulld %%xmm6,%%xmm0 \n"
|
||||
"movdqa %5,%%xmm5 \n"
|
||||
"movdqa %%xmm1,%%xmm2 \n"
|
||||
"punpcklbw %%xmm7,%%xmm2 \n"
|
||||
"movdqa %%xmm2,%%xmm3 \n"
|
||||
"punpcklwd %%xmm7,%%xmm3 \n"
|
||||
"pmulld %%xmm5,%%xmm3 \n"
|
||||
"movdqa %6,%%xmm5 \n"
|
||||
"movdqa %%xmm2,%%xmm4 \n"
|
||||
"punpckhwd %%xmm7,%%xmm4 \n"
|
||||
"pmulld %%xmm5,%%xmm4 \n"
|
||||
"movdqa %7,%%xmm5 \n"
|
||||
"punpckhbw %%xmm7,%%xmm1 \n"
|
||||
"movdqa %%xmm1,%%xmm2 \n"
|
||||
"punpcklwd %%xmm7,%%xmm2 \n"
|
||||
"pmulld %%xmm5,%%xmm2 \n"
|
||||
"movdqa %8,%%xmm5 \n"
|
||||
"punpckhwd %%xmm7,%%xmm1 \n"
|
||||
"pmulld %%xmm5,%%xmm1 \n"
|
||||
"paddd %%xmm4,%%xmm3 \n"
|
||||
"paddd %%xmm2,%%xmm1 \n"
|
||||
"paddd %%xmm3,%%xmm1 \n"
|
||||
"pshufd $0xe,%%xmm1,%%xmm2 \n"
|
||||
"paddd %%xmm2,%%xmm1 \n"
|
||||
"pshufd $0x1,%%xmm1,%%xmm2 \n"
|
||||
"paddd %%xmm2,%%xmm1 \n"
|
||||
"paddd %%xmm1,%%xmm0 \n"
|
||||
"sub $0x10,%1 \n"
|
||||
"jg 1b \n"
|
||||
"movd %%xmm0,%3 \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(count), // %1
|
||||
"+rm"(seed), // %2
|
||||
"=r"(hash) // %3
|
||||
: "m"(kHash16x33), // %4
|
||||
"m"(kHashMul0), // %5
|
||||
"m"(kHashMul1), // %6
|
||||
"m"(kHashMul2), // %7
|
||||
"m"(kHashMul3) // %8
|
||||
: "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
return hash;
|
||||
}
|
||||
#endif  // !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) && !defined(LIBYUV_ENABLE_ROWWIN)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
96
3rdparty/libyuv/source/compare_neon.cc
vendored
Normal file
96
3rdparty/libyuv/source/compare_neon.cc
vendored
Normal file
@@ -0,0 +1,96 @@
|
||||
/*
|
||||
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
|
||||
#include "libyuv/compare_row.h"
|
||||
#include "libyuv/row.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
|
||||
!defined(__aarch64__)
|
||||
|
||||
// 256 bits at a time
|
||||
// uses short accumulator which restricts count to 131 KB
|
||||
uint32_t HammingDistance_NEON(const uint8_t* src_a,
|
||||
const uint8_t* src_b,
|
||||
int count) {
|
||||
uint32_t diff;
|
||||
|
||||
asm volatile(
|
||||
"vmov.u16 q4, #0 \n" // accumulator
|
||||
|
||||
"1: \n"
|
||||
"vld1.8 {q0, q1}, [%0]! \n"
|
||||
"vld1.8 {q2, q3}, [%1]! \n"
|
||||
"veor.32 q0, q0, q2 \n"
|
||||
"veor.32 q1, q1, q3 \n"
|
||||
"vcnt.i8 q0, q0 \n"
|
||||
"vcnt.i8 q1, q1 \n"
|
||||
"subs %2, %2, #32 \n"
|
||||
"vadd.u8 q0, q0, q1 \n" // 16 byte counts
|
||||
"vpadal.u8 q4, q0 \n" // 8 shorts
|
||||
"bgt 1b \n"
|
||||
|
||||
"vpaddl.u16 q0, q4 \n" // 4 ints
|
||||
"vpadd.u32 d0, d0, d1 \n"
|
||||
"vpadd.u32 d0, d0, d0 \n"
|
||||
"vmov.32 %3, d0[0] \n"
|
||||
|
||||
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
|
||||
:
|
||||
: "cc", "q0", "q1", "q2", "q3", "q4");
|
||||
return diff;
|
||||
}
|
||||
|
||||
uint32_t SumSquareError_NEON(const uint8_t* src_a,
|
||||
const uint8_t* src_b,
|
||||
int count) {
|
||||
uint32_t sse;
|
||||
asm volatile(
|
||||
"vmov.u8 q8, #0 \n"
|
||||
"vmov.u8 q10, #0 \n"
|
||||
"vmov.u8 q9, #0 \n"
|
||||
"vmov.u8 q11, #0 \n"
|
||||
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n"
|
||||
"vld1.8 {q1}, [%1]! \n"
|
||||
"subs %2, %2, #16 \n"
|
||||
"vsubl.u8 q2, d0, d2 \n"
|
||||
"vsubl.u8 q3, d1, d3 \n"
|
||||
"vmlal.s16 q8, d4, d4 \n"
|
||||
"vmlal.s16 q9, d6, d6 \n"
|
||||
"vmlal.s16 q10, d5, d5 \n"
|
||||
"vmlal.s16 q11, d7, d7 \n"
|
||||
"bgt 1b \n"
|
||||
|
||||
"vadd.u32 q8, q8, q9 \n"
|
||||
"vadd.u32 q10, q10, q11 \n"
|
||||
"vadd.u32 q11, q8, q10 \n"
|
||||
"vpaddl.u32 q1, q11 \n"
|
||||
"vadd.u64 d0, d2, d3 \n"
|
||||
"vmov.32 %3, d0[0] \n"
|
||||
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
|
||||
:
|
||||
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
|
||||
return sse;
|
||||
}
|
||||
|
||||
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
223
3rdparty/libyuv/source/compare_neon64.cc
vendored
Normal file
223
3rdparty/libyuv/source/compare_neon64.cc
vendored
Normal file
@@ -0,0 +1,223 @@
|
||||
/*
|
||||
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
|
||||
#include "libyuv/compare_row.h"
|
||||
#include "libyuv/row.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
|
||||
// 256 bits at a time
|
||||
// uses short accumulator which restricts count to 131 KB
|
||||
uint32_t HammingDistance_NEON(const uint8_t* src_a,
|
||||
const uint8_t* src_b,
|
||||
int count) {
|
||||
uint32_t diff;
|
||||
asm volatile(
|
||||
"movi v4.8h, #0 \n"
|
||||
|
||||
"1: \n"
|
||||
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"
|
||||
"ld1 {v2.16b, v3.16b}, [%1], #32 \n"
|
||||
"eor v0.16b, v0.16b, v2.16b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"eor v1.16b, v1.16b, v3.16b \n"
|
||||
"cnt v0.16b, v0.16b \n"
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"cnt v1.16b, v1.16b \n"
|
||||
"subs %w2, %w2, #32 \n"
|
||||
"add v0.16b, v0.16b, v1.16b \n"
|
||||
"uadalp v4.8h, v0.16b \n"
|
||||
"b.gt 1b \n"
|
||||
|
||||
"uaddlv s4, v4.8h \n"
|
||||
"fmov %w3, s4 \n"
|
||||
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
|
||||
:
|
||||
: "memory", "cc", "v0", "v1", "v2", "v3", "v4");
|
||||
return diff;
|
||||
}
|
||||
|
||||
// Returns the sum of squared byte differences between src_a and src_b.
// Processes 16 bytes per loop pass; the loop body runs before the count test,
// so at least one pass always executes.
uint32_t SumSquareError_NEON(const uint8_t* src_a,
                             const uint8_t* src_b,
                             int count) {
  uint32_t sse;
  asm volatile(
      // Four independent 4 x u32 accumulators.
      "movi        v16.16b, #0                        \n"
      "movi        v17.16b, #0                        \n"
      "movi        v18.16b, #0                        \n"
      "movi        v19.16b, #0                        \n"

      "1:                                             \n"
      "ld1         {v0.16b}, [%[src_a]], #16          \n"
      "ld1         {v1.16b}, [%[src_b]], #16          \n"
      "subs        %w[count], %w[count], #16          \n"
      "usubl       v2.8h, v0.8b, v1.8b                \n"  // low 8 differences
      "usubl2      v3.8h, v0.16b, v1.16b              \n"  // high 8 differences
      "prfm        pldl1keep, [%[src_a], 448]         \n"  // prefetch 7 lines ahead
      "smlal       v16.4s, v2.4h, v2.4h               \n"  // += diff * diff
      "smlal       v17.4s, v3.4h, v3.4h               \n"
      "prfm        pldl1keep, [%[src_b], 448]         \n"
      "smlal2      v18.4s, v2.8h, v2.8h               \n"
      "smlal2      v19.4s, v3.8h, v3.8h               \n"
      "b.gt        1b                                 \n"

      // Fold the four accumulators, then the four lanes, into one value.
      "add         v16.4s, v16.4s, v17.4s             \n"
      "add         v18.4s, v18.4s, v19.4s             \n"
      "add         v19.4s, v16.4s, v18.4s             \n"
      "addv        s0, v19.4s                         \n"
      "fmov        %w[sse], s0                        \n"
      : [src_a] "+r"(src_a),  // %[src_a]
        [src_b] "+r"(src_b),  // %[src_b]
        [count] "+r"(count),  // %[count]
        [sse] "=r"(sse)       // %[sse]
      :
      : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
  return sse;
}
|
||||
|
||||
// Powers of 33 (33^15 down to 33^0, truncated to 32 bits), four per vector.
// HashDjb2_NEON loads all four rows into v4..v7 and multiplies them against
// the 16 widened source bytes of each block.
static const uvec32 kDjb2Multiplicands[] = {
|
||||
{0x0c3525e1, // 33^15
|
||||
0xa3476dc1, // 33^14
|
||||
0x3b4039a1, // 33^13
|
||||
0x4f5f0981}, // 33^12
|
||||
{0x30f35d61, // 33^11
|
||||
0x855cb541, // 33^10
|
||||
0x040a9121, // 33^9
|
||||
0x747c7101}, // 33^8
|
||||
{0xec41d4e1, // 33^7
|
||||
0x4cfa3cc1, // 33^6
|
||||
0x025528a1, // 33^5
|
||||
0x00121881}, // 33^4
|
||||
{0x00008c61, // 33^3
|
||||
0x00000441, // 33^2
|
||||
0x00000021, // 33^1
|
||||
0x00000001}, // 33^0
|
||||
};
|
||||
|
||||
// TBL index vectors used by HashDjb2_NEON (loaded into v16..v19).
// In each 32-bit lane the low byte selects one source byte (0x00..0x0f) and
// the other three index bytes are 0xff; TBL yields zero for an out-of-range
// index, so each lane becomes the selected byte zero-extended to 32 bits.
static const uvec32 kDjb2WidenIndices[] = {
|
||||
{0xffffff00U, 0xffffff01U, 0xffffff02U, 0xffffff03U},
|
||||
{0xffffff04U, 0xffffff05U, 0xffffff06U, 0xffffff07U},
|
||||
{0xffffff08U, 0xffffff09U, 0xffffff0aU, 0xffffff0bU},
|
||||
{0xffffff0cU, 0xffffff0dU, 0xffffff0eU, 0xffffff0fU},
|
||||
};
|
||||
|
||||
uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed) {
|
||||
uint32_t hash = seed;
|
||||
const uint32_t c16 = 0x92d9e201; // 33^16
|
||||
uint32_t tmp, tmp2;
|
||||
asm("ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n"
|
||||
"ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[kMuls]] \n"
|
||||
|
||||
// count is always a multiple of 16.
|
||||
// maintain two accumulators, reduce and then final sum in scalar since
|
||||
// this has better performance on little cores.
|
||||
"1: \n"
|
||||
"ldr q0, [%[src]], #16 \n"
|
||||
"subs %w[count], %w[count], #16 \n"
|
||||
"tbl v3.16b, {v0.16b}, v19.16b \n"
|
||||
"tbl v2.16b, {v0.16b}, v18.16b \n"
|
||||
"tbl v1.16b, {v0.16b}, v17.16b \n"
|
||||
"tbl v0.16b, {v0.16b}, v16.16b \n"
|
||||
"mul v3.4s, v3.4s, v7.4s \n"
|
||||
"mul v2.4s, v2.4s, v6.4s \n"
|
||||
"mla v3.4s, v1.4s, v5.4s \n"
|
||||
"mla v2.4s, v0.4s, v4.4s \n"
|
||||
"addv s1, v3.4s \n"
|
||||
"addv s0, v2.4s \n"
|
||||
"fmov %w[tmp2], s1 \n"
|
||||
"fmov %w[tmp], s0 \n"
|
||||
"add %w[tmp], %w[tmp], %w[tmp2] \n"
|
||||
"madd %w[hash], %w[hash], %w[c16], %w[tmp] \n"
|
||||
"b.gt 1b \n"
|
||||
: [hash] "+r"(hash), // %[hash]
|
||||
[count] "+r"(count), // %[count]
|
||||
[tmp] "=&r"(tmp), // %[tmp]
|
||||
[tmp2] "=&r"(tmp2) // %[tmp2]
|
||||
: [src] "r"(src), // %[src]
|
||||
[kMuls] "r"(kDjb2Multiplicands), // %[kMuls]
|
||||
[kIdx] "r"(kDjb2WidenIndices), // %[kIdx]
|
||||
[c16] "r"(c16) // %[c16]
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
|
||||
"v17", "v18", "v19");
|
||||
return hash;
|
||||
}
|
||||
|
||||
// Counts the bits that differ between src_a and src_b, 32 bytes per loop
// pass. Per-byte popcounts are summed into 32-bit lanes with UDOT against a
// vector of ones. The loop body runs before the count test, so at least one
// pass always executes.
uint32_t HammingDistance_NEON_DotProd(const uint8_t* src_a,
                                      const uint8_t* src_b,
                                      int count) {
  uint32_t diff;
  asm volatile(
      "movi        v4.4s, #0                          \n"  // accumulator 0
      "movi        v5.4s, #0                          \n"  // accumulator 1
      "movi        v6.16b, #1                         \n"  // all-ones multiplier

      "1:                                             \n"
      "ldp         q0, q1, [%[src_a]], #32            \n"
      "ldp         q2, q3, [%[src_b]], #32            \n"
      "eor         v0.16b, v0.16b, v2.16b             \n"  // differing bits
      "prfm        pldl1keep, [%[src_a], 448]         \n"  // prefetch 7 lines ahead
      "eor         v1.16b, v1.16b, v3.16b             \n"
      "cnt         v0.16b, v0.16b                     \n"  // popcount per byte
      "prfm        pldl1keep, [%[src_b], 448]         \n"
      "cnt         v1.16b, v1.16b                     \n"
      "subs        %w[count], %w[count], #32          \n"
      "udot        v4.4s, v0.16b, v6.16b              \n"
      "udot        v5.4s, v1.16b, v6.16b              \n"
      "b.gt        1b                                 \n"

      "add         v0.4s, v4.4s, v5.4s                \n"
      "addv        s0, v0.4s                          \n"
      "fmov        %w[diff], s0                       \n"
      : [src_a] "+r"(src_a),  // %[src_a]
        [src_b] "+r"(src_b),  // %[src_b]
        [count] "+r"(count),  // %[count]
        [diff] "=r"(diff)     // %[diff]
      :
      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
  return diff;
}
|
||||
|
||||
// Returns the sum of squared byte differences between src_a and src_b,
// 32 bytes per loop pass: absolute differences are squared and summed into
// 32-bit lanes by dotting each difference vector with itself.
uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a,
                                     const uint8_t* src_b,
                                     int count) {
  // count is guaranteed to be a multiple of 32.
  uint32_t sse;
  asm volatile(
      "movi        v4.4s, #0                          \n"  // accumulator 0
      "movi        v5.4s, #0                          \n"  // accumulator 1

      "1:                                             \n"
      "ldp         q0, q2, [%[src_a]], #32            \n"
      "ldp         q1, q3, [%[src_b]], #32            \n"
      "subs        %w[count], %w[count], #32          \n"
      "uabd        v0.16b, v0.16b, v1.16b             \n"  // |a - b|, first 16
      "uabd        v1.16b, v2.16b, v3.16b             \n"  // |a - b|, last 16
      "prfm        pldl1keep, [%[src_a], 448]         \n"  // prefetch 7 lines ahead
      "udot        v4.4s, v0.16b, v0.16b              \n"
      "udot        v5.4s, v1.16b, v1.16b              \n"
      "prfm        pldl1keep, [%[src_b], 448]         \n"
      "b.gt        1b                                 \n"

      "add         v0.4s, v4.4s, v5.4s                \n"
      "addv        s0, v0.4s                          \n"
      "fmov        %w[sse], s0                        \n"
      : [src_a] "+r"(src_a),  // %[src_a]
        [src_b] "+r"(src_b),  // %[src_b]
        [count] "+r"(count),  // %[count]
        [sse] "=r"(sse)       // %[sse]
      :
      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5");
  return sse;
}
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
241
3rdparty/libyuv/source/compare_win.cc
vendored
Normal file
241
3rdparty/libyuv/source/compare_win.cc
vendored
Normal file
@@ -0,0 +1,241 @@
|
||||
/*
|
||||
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
|
||||
#include "libyuv/compare_row.h"
|
||||
#include "libyuv/row.h"
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#include <intrin.h> // For __popcnt
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// This module is for 32 bit Visual C x86
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && defined(_M_IX86) && \
|
||||
(!defined(__clang__) || defined(LIBYUV_ENABLE_ROWWIN))
|
||||
|
||||
uint32_t HammingDistance_SSE42(const uint8_t* src_a,
|
||||
const uint8_t* src_b,
|
||||
int count) {
|
||||
uint32_t diff = 0u;
|
||||
|
||||
int i;
|
||||
for (i = 0; i < count - 3; i += 4) {
|
||||
uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b); // NOLINT
|
||||
src_a += 4;
|
||||
src_b += 4;
|
||||
diff += __popcnt(x);
|
||||
}
|
||||
return diff;
|
||||
}
|
||||
|
||||
// Returns the sum of squared byte differences between src_a and src_b.
// Naked function: the arguments are read directly from the stack and the
// result is returned in eax. The loop handles 16 bytes per pass and tests the
// count at the bottom, so at least one pass always executes.
// NOTE(review): presumably count is a positive multiple of 16 - confirm
// against callers.
__declspec(naked) uint32_t
|
||||
SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] // src_a
|
||||
mov edx, [esp + 8] // src_b
|
||||
mov ecx, [esp + 12] // count
|
||||
pxor xmm0, xmm0 // running sum, 4 x u32
|
||||
pxor xmm5, xmm5 // constant 0 for unpack
|
||||
|
||||
wloop:
|
||||
movdqu xmm1, [eax]
|
||||
lea eax, [eax + 16]
|
||||
movdqu xmm2, [edx]
|
||||
lea edx, [edx + 16]
|
||||
movdqa xmm3, xmm1 // abs trick
|
||||
psubusb xmm1, xmm2 // saturating a - b
|
||||
psubusb xmm2, xmm3 // saturating b - a
|
||||
por xmm1, xmm2 // one side is 0, so OR gives |a - b|
|
||||
movdqa xmm2, xmm1
|
||||
punpcklbw xmm1, xmm5 // low 8 differences widened to u16
|
||||
punpckhbw xmm2, xmm5 // high 8 differences widened to u16
|
||||
pmaddwd xmm1, xmm1 // square and add adjacent pairs into u32
|
||||
pmaddwd xmm2, xmm2
|
||||
paddd xmm0, xmm1
|
||||
paddd xmm0, xmm2
|
||||
sub ecx, 16
|
||||
jg wloop
|
||||
|
||||
// Horizontal add of the four u32 lanes into lane 0.
pshufd xmm1, xmm0, 0xee
|
||||
paddd xmm0, xmm1
|
||||
pshufd xmm1, xmm0, 0x01
|
||||
paddd xmm0, xmm1
|
||||
movd eax, xmm0 // return value
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef HAS_SUMSQUAREERROR_AVX2
|
||||
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
|
||||
#pragma warning(disable : 4752)
|
||||
// Returns the sum of squared byte differences between src_a and src_b, using
// 256-bit registers (32 bytes per pass). Naked function: the arguments are
// read directly from the stack and the result is returned in eax. The count
// is tested at the bottom of the loop, so at least one pass always executes.
// NOTE(review): presumably count is a positive multiple of 32 - confirm
// against callers.
__declspec(naked) uint32_t
|
||||
SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] // src_a
|
||||
mov edx, [esp + 8] // src_b
|
||||
mov ecx, [esp + 12] // count
|
||||
vpxor ymm0, ymm0, ymm0 // sum
|
||||
vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
|
||||
sub edx, eax // edx = src_b - src_a, so only eax needs advancing
|
||||
|
||||
wloop:
|
||||
vmovdqu ymm1, [eax]
|
||||
vmovdqu ymm2, [eax + edx] // matching bytes of src_b
|
||||
lea eax, [eax + 32]
|
||||
vpsubusb ymm3, ymm1, ymm2 // abs difference trick
|
||||
vpsubusb ymm2, ymm2, ymm1
|
||||
vpor ymm1, ymm2, ymm3
|
||||
vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order.
|
||||
vpunpckhbw ymm1, ymm1, ymm5
|
||||
vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
|
||||
vpmaddwd ymm1, ymm1, ymm1
|
||||
vpaddd ymm0, ymm0, ymm1
|
||||
vpaddd ymm0, ymm0, ymm2
|
||||
sub ecx, 32
|
||||
jg wloop
|
||||
|
||||
vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
|
||||
vpaddd ymm0, ymm0, ymm1
|
||||
vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
|
||||
vpaddd ymm0, ymm0, ymm1
|
||||
vpermq ymm1, ymm0, 0x02 // high + low lane.
|
||||
vpaddd ymm0, ymm0, ymm1
|
||||
vmovd eax, xmm0 // return value
|
||||
vzeroupper
|
||||
ret
|
||||
}
|
||||
}
|
||||
#endif // HAS_SUMSQUAREERROR_AVX2
|
||||
|
||||
// Constants for the djb2 hash routines in this file (HashDjb2_SSE41 and
// HashDjb2_AVX2), which reference them by name from inline assembly.
// Only lane 0 of kHash16x33 is non-zero; it scales the running hash.
uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16
|
||||
// Multipliers for source bytes 0-3.
uvec32 kHashMul0 = {
|
||||
0x0c3525e1, // 33 ^ 15
|
||||
0xa3476dc1, // 33 ^ 14
|
||||
0x3b4039a1, // 33 ^ 13
|
||||
0x4f5f0981, // 33 ^ 12
|
||||
};
|
||||
// Multipliers for source bytes 4-7.
uvec32 kHashMul1 = {
|
||||
0x30f35d61, // 33 ^ 11
|
||||
0x855cb541, // 33 ^ 10
|
||||
0x040a9121, // 33 ^ 9
|
||||
0x747c7101, // 33 ^ 8
|
||||
};
|
||||
// Multipliers for source bytes 8-11.
uvec32 kHashMul2 = {
|
||||
0xec41d4e1, // 33 ^ 7
|
||||
0x4cfa3cc1, // 33 ^ 6
|
||||
0x025528a1, // 33 ^ 5
|
||||
0x00121881, // 33 ^ 4
|
||||
};
|
||||
// Multipliers for source bytes 12-15.
uvec32 kHashMul3 = {
|
||||
0x00008c61, // 33 ^ 3
|
||||
0x00000441, // 33 ^ 2
|
||||
0x00000021, // 33 ^ 1
|
||||
0x00000001, // 33 ^ 0
|
||||
};
|
||||
|
||||
// djb2 hash over `count` bytes, 16 bytes per loop pass:
//   hash = hash * 33^16 + sum(src[i] * 33^(15 - i)), i = 0..15
// Naked function: the arguments are read directly from the stack and the
// hash is returned in eax. The count is tested at the bottom of the loop, so
// at least one 16-byte block is always consumed.
__declspec(naked) uint32_t
|
||||
HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] // src
|
||||
mov ecx, [esp + 8] // count
|
||||
movd xmm0, [esp + 12] // seed
|
||||
|
||||
pxor xmm7, xmm7 // constant 0 for unpck
|
||||
movdqa xmm6, xmmword ptr kHash16x33
|
||||
|
||||
wloop:
|
||||
movdqu xmm1, [eax] // src[0-15]
|
||||
lea eax, [eax + 16]
|
||||
pmulld xmm0, xmm6 // hash *= 33 ^ 16
|
||||
movdqa xmm5, xmmword ptr kHashMul0
|
||||
movdqa xmm2, xmm1
|
||||
punpcklbw xmm2, xmm7 // src[0-7]
|
||||
movdqa xmm3, xmm2
|
||||
punpcklwd xmm3, xmm7 // src[0-3]
|
||||
pmulld xmm3, xmm5
|
||||
movdqa xmm5, xmmword ptr kHashMul1
|
||||
movdqa xmm4, xmm2
|
||||
punpckhwd xmm4, xmm7 // src[4-7]
|
||||
pmulld xmm4, xmm5
|
||||
movdqa xmm5, xmmword ptr kHashMul2
|
||||
punpckhbw xmm1, xmm7 // src[8-15]
|
||||
movdqa xmm2, xmm1
|
||||
punpcklwd xmm2, xmm7 // src[8-11]
|
||||
pmulld xmm2, xmm5
|
||||
movdqa xmm5, xmmword ptr kHashMul3
|
||||
punpckhwd xmm1, xmm7 // src[12-15]
|
||||
pmulld xmm1, xmm5
|
||||
paddd xmm3, xmm4 // add 16 results
|
||||
paddd xmm1, xmm2
|
||||
paddd xmm1, xmm3
|
||||
|
||||
// Horizontal add of the four u32 lanes into lane 0, then fold into hash.
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
|
||||
paddd xmm1, xmm2
|
||||
pshufd xmm2, xmm1, 0x01
|
||||
paddd xmm1, xmm2
|
||||
paddd xmm0, xmm1
|
||||
sub ecx, 16
|
||||
jg wloop
|
||||
|
||||
movd eax, xmm0 // return hash
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
// Visual C 2012 required for AVX2.
|
||||
#ifdef HAS_HASHDJB2_AVX2
|
||||
// djb2 hash over `count` bytes, 16 bytes per loop pass:
//   hash = hash * 33^16 + sum(src[i] * 33^(15 - i)), i = 0..15
// VEX-encoded variant: vpmovzxbd widens four source bytes straight from
// memory, and the multiplier tables are used as memory operands. Naked
// function: arguments are read from the stack, the hash is returned in eax.
// The count is tested at the bottom of the loop, so at least one 16-byte
// block is always consumed.
__declspec(naked) uint32_t
|
||||
HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] // src
|
||||
mov ecx, [esp + 8] // count
|
||||
vmovd xmm0, [esp + 12] // seed
|
||||
|
||||
wloop:
|
||||
vpmovzxbd xmm3, [eax] // src[0-3]
|
||||
vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16
|
||||
vpmovzxbd xmm4, [eax + 4] // src[4-7]
|
||||
vpmulld xmm3, xmm3, xmmword ptr kHashMul0
|
||||
vpmovzxbd xmm2, [eax + 8] // src[8-11]
|
||||
vpmulld xmm4, xmm4, xmmword ptr kHashMul1
|
||||
vpmovzxbd xmm1, [eax + 12] // src[12-15]
|
||||
vpmulld xmm2, xmm2, xmmword ptr kHashMul2
|
||||
lea eax, [eax + 16]
|
||||
vpmulld xmm1, xmm1, xmmword ptr kHashMul3
|
||||
vpaddd xmm3, xmm3, xmm4 // add 16 results
|
||||
vpaddd xmm1, xmm1, xmm2
|
||||
vpaddd xmm1, xmm1, xmm3
|
||||
// Horizontal add of the four u32 lanes into lane 0, then fold into hash.
vpshufd xmm2, xmm1, 0x0e // upper 2 dwords
|
||||
vpaddd xmm1, xmm1,xmm2
|
||||
vpshufd xmm2, xmm1, 0x01
|
||||
vpaddd xmm1, xmm1, xmm2
|
||||
vpaddd xmm0, xmm0, xmm1
|
||||
sub ecx, 16
|
||||
jg wloop
|
||||
|
||||
vmovd eax, xmm0 // return hash
|
||||
vzeroupper
|
||||
ret
|
||||
}
|
||||
}
|
||||
#endif // HAS_HASHDJB2_AVX2
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
5005
3rdparty/libyuv/source/convert.cc
vendored
Normal file
5005
3rdparty/libyuv/source/convert.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
9179
3rdparty/libyuv/source/convert_argb.cc
vendored
Normal file
9179
3rdparty/libyuv/source/convert_argb.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
866
3rdparty/libyuv/source/convert_from.cc
vendored
Normal file
866
3rdparty/libyuv/source/convert_from.cc
vendored
Normal file
@@ -0,0 +1,866 @@
|
||||
/*
|
||||
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/convert_from.h"
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
#include "libyuv/convert.h" // For I420Copy
|
||||
#include "libyuv/cpu_id.h"
|
||||
#include "libyuv/planar_functions.h"
|
||||
#include "libyuv/rotate.h"
|
||||
#include "libyuv/row.h"
|
||||
#include "libyuv/scale.h" // For ScalePlane()
|
||||
#include "libyuv/video_common.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Sign-preserving subsample: shifts the magnitude of v (plus rounding term a)
// right by s and restores the sign, so -5 maps to -3 just as 5 maps to 3.
// Every argument and the whole expansion are parenthesized so the macro is
// safe inside larger expressions and with compound arguments.
// Note: v and a are evaluated more than once - avoid side effects.
#define SUBSAMPLE(v, a, s) \
  (((v) < 0) ? (-((-(v) + (a)) >> (s))) : (((v) + (a)) >> (s)))
|
||||
// Returns the magnitude of v.
static __inline int Abs(int v) {
  if (v < 0) {
    return -v;
  }
  return v;
}
|
||||
|
||||
// I420 To any I4xx YUV format with mirroring.
|
||||
static int I420ToI4xx(const uint8_t* src_y,
|
||||
int src_stride_y,
|
||||
const uint8_t* src_u,
|
||||
int src_stride_u,
|
||||
const uint8_t* src_v,
|
||||
int src_stride_v,
|
||||
uint8_t* dst_y,
|
||||
int dst_stride_y,
|
||||
uint8_t* dst_u,
|
||||
int dst_stride_u,
|
||||
uint8_t* dst_v,
|
||||
int dst_stride_v,
|
||||
int src_y_width,
|
||||
int src_y_height,
|
||||
int dst_uv_width,
|
||||
int dst_uv_height) {
|
||||
const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1);
|
||||
const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1);
|
||||
int r;
|
||||
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v ||
|
||||
src_y_width <= 0 || src_y_height == 0 || dst_uv_width <= 0 ||
|
||||
dst_uv_height <= 0) {
|
||||
return -1;
|
||||
}
|
||||
if (dst_y) {
|
||||
CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, src_y_width,
|
||||
src_y_height);
|
||||
}
|
||||
r = ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,
|
||||
dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear);
|
||||
if (r != 0) {
|
||||
return r;
|
||||
}
|
||||
r = ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
|
||||
dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Convert 8 bit YUV to 10 bit.
|
||||
LIBYUV_API
|
||||
int I420ToI010(const uint8_t* src_y,
|
||||
int src_stride_y,
|
||||
const uint8_t* src_u,
|
||||
int src_stride_u,
|
||||
const uint8_t* src_v,
|
||||
int src_stride_v,
|
||||
uint16_t* dst_y,
|
||||
int dst_stride_y,
|
||||
uint16_t* dst_u,
|
||||
int dst_stride_u,
|
||||
uint16_t* dst_v,
|
||||
int dst_stride_v,
|
||||
int width,
|
||||
int height) {
|
||||
int halfwidth = (width + 1) >> 1;
|
||||
int halfheight = (height + 1) >> 1;
|
||||
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
|
||||
height == 0) {
|
||||
return -1;
|
||||
}
|
||||
// Negative height means invert the image.
|
||||
if (height < 0) {
|
||||
height = -height;
|
||||
halfheight = (height + 1) >> 1;
|
||||
src_y = src_y + (height - 1) * src_stride_y;
|
||||
src_u = src_u + (halfheight - 1) * src_stride_u;
|
||||
src_v = src_v + (halfheight - 1) * src_stride_v;
|
||||
src_stride_y = -src_stride_y;
|
||||
src_stride_u = -src_stride_u;
|
||||
src_stride_v = -src_stride_v;
|
||||
}
|
||||
|
||||
// Convert Y plane.
|
||||
Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 1024, width,
|
||||
height);
|
||||
// Convert UV planes.
|
||||
Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 1024, halfwidth,
|
||||
halfheight);
|
||||
Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 1024, halfwidth,
|
||||
halfheight);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Convert 8 bit YUV to 12 bit.
|
||||
LIBYUV_API
|
||||
int I420ToI012(const uint8_t* src_y,
|
||||
int src_stride_y,
|
||||
const uint8_t* src_u,
|
||||
int src_stride_u,
|
||||
const uint8_t* src_v,
|
||||
int src_stride_v,
|
||||
uint16_t* dst_y,
|
||||
int dst_stride_y,
|
||||
uint16_t* dst_u,
|
||||
int dst_stride_u,
|
||||
uint16_t* dst_v,
|
||||
int dst_stride_v,
|
||||
int width,
|
||||
int height) {
|
||||
int halfwidth = (width + 1) >> 1;
|
||||
int halfheight = (height + 1) >> 1;
|
||||
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
|
||||
height == 0) {
|
||||
return -1;
|
||||
}
|
||||
// Negative height means invert the image.
|
||||
if (height < 0) {
|
||||
height = -height;
|
||||
halfheight = (height + 1) >> 1;
|
||||
src_y = src_y + (height - 1) * src_stride_y;
|
||||
src_u = src_u + (halfheight - 1) * src_stride_u;
|
||||
src_v = src_v + (halfheight - 1) * src_stride_v;
|
||||
src_stride_y = -src_stride_y;
|
||||
src_stride_u = -src_stride_u;
|
||||
src_stride_v = -src_stride_v;
|
||||
}
|
||||
|
||||
// Convert Y plane.
|
||||
Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 4096, width,
|
||||
height);
|
||||
// Convert UV planes.
|
||||
Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 4096, halfwidth,
|
||||
halfheight);
|
||||
Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 4096, halfwidth,
|
||||
halfheight);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// 420 chroma is 1/2 width, 1/2 height
|
||||
// 422 chroma is 1/2 width, 1x height
|
||||
LIBYUV_API
|
||||
int I420ToI422(const uint8_t* src_y,
|
||||
int src_stride_y,
|
||||
const uint8_t* src_u,
|
||||
int src_stride_u,
|
||||
const uint8_t* src_v,
|
||||
int src_stride_v,
|
||||
uint8_t* dst_y,
|
||||
int dst_stride_y,
|
||||
uint8_t* dst_u,
|
||||
int dst_stride_u,
|
||||
uint8_t* dst_v,
|
||||
int dst_stride_v,
|
||||
int width,
|
||||
int height) {
|
||||
const int dst_uv_width = (Abs(width) + 1) >> 1;
|
||||
const int dst_uv_height = Abs(height);
|
||||
return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v,
|
||||
src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
|
||||
dst_v, dst_stride_v, width, height, dst_uv_width,
|
||||
dst_uv_height);
|
||||
}
|
||||
|
||||
// 420 chroma is 1/2 width, 1/2 height
|
||||
// 444 chroma is 1x width, 1x height
|
||||
LIBYUV_API
|
||||
int I420ToI444(const uint8_t* src_y,
|
||||
int src_stride_y,
|
||||
const uint8_t* src_u,
|
||||
int src_stride_u,
|
||||
const uint8_t* src_v,
|
||||
int src_stride_v,
|
||||
uint8_t* dst_y,
|
||||
int dst_stride_y,
|
||||
uint8_t* dst_u,
|
||||
int dst_stride_u,
|
||||
uint8_t* dst_v,
|
||||
int dst_stride_v,
|
||||
int width,
|
||||
int height) {
|
||||
const int dst_uv_width = Abs(width);
|
||||
const int dst_uv_height = Abs(height);
|
||||
return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v,
|
||||
src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
|
||||
dst_v, dst_stride_v, width, height, dst_uv_width,
|
||||
dst_uv_height);
|
||||
}
|
||||
|
||||
// 420 chroma to 444 chroma, 10/12 bit version
|
||||
LIBYUV_API
|
||||
int I010ToI410(const uint16_t* src_y,
|
||||
int src_stride_y,
|
||||
const uint16_t* src_u,
|
||||
int src_stride_u,
|
||||
const uint16_t* src_v,
|
||||
int src_stride_v,
|
||||
uint16_t* dst_y,
|
||||
int dst_stride_y,
|
||||
uint16_t* dst_u,
|
||||
int dst_stride_u,
|
||||
uint16_t* dst_v,
|
||||
int dst_stride_v,
|
||||
int width,
|
||||
int height) {
|
||||
int r;
|
||||
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
|
||||
height == 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (dst_y) {
|
||||
CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
|
||||
}
|
||||
r = ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1),
|
||||
SUBSAMPLE(height, 1, 1), dst_u, dst_stride_u, width,
|
||||
Abs(height), kFilterBilinear);
|
||||
if (r != 0) {
|
||||
return r;
|
||||
}
|
||||
r = ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1),
|
||||
SUBSAMPLE(height, 1, 1), dst_v, dst_stride_v, width,
|
||||
Abs(height), kFilterBilinear);
|
||||
return r;
|
||||
}
|
||||
|
||||
// 422 chroma to 444 chroma, 10/12 bit version
|
||||
LIBYUV_API
|
||||
int I210ToI410(const uint16_t* src_y,
|
||||
int src_stride_y,
|
||||
const uint16_t* src_u,
|
||||
int src_stride_u,
|
||||
const uint16_t* src_v,
|
||||
int src_stride_v,
|
||||
uint16_t* dst_y,
|
||||
int dst_stride_y,
|
||||
uint16_t* dst_u,
|
||||
int dst_stride_u,
|
||||
uint16_t* dst_v,
|
||||
int dst_stride_v,
|
||||
int width,
|
||||
int height) {
|
||||
int r;
|
||||
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
|
||||
height == 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (dst_y) {
|
||||
CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
|
||||
}
|
||||
r = ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u,
|
||||
dst_stride_u, width, Abs(height), kFilterBilinear);
|
||||
if (r != 0) {
|
||||
return r;
|
||||
}
|
||||
r = ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v,
|
||||
dst_stride_v, width, Abs(height), kFilterBilinear);
|
||||
return r;
|
||||
}
|
||||
|
||||
// 422 chroma is 1/2 width, 1x height
|
||||
// 444 chroma is 1x width, 1x height
|
||||
LIBYUV_API
|
||||
int I422ToI444(const uint8_t* src_y,
|
||||
int src_stride_y,
|
||||
const uint8_t* src_u,
|
||||
int src_stride_u,
|
||||
const uint8_t* src_v,
|
||||
int src_stride_v,
|
||||
uint8_t* dst_y,
|
||||
int dst_stride_y,
|
||||
uint8_t* dst_u,
|
||||
int dst_stride_u,
|
||||
uint8_t* dst_v,
|
||||
int dst_stride_v,
|
||||
int width,
|
||||
int height) {
|
||||
int r;
|
||||
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
|
||||
height == 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (dst_y) {
|
||||
CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
|
||||
}
|
||||
r = ScalePlane(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u,
|
||||
dst_stride_u, width, Abs(height), kFilterBilinear);
|
||||
if (r != 0) {
|
||||
return r;
|
||||
}
|
||||
r = ScalePlane(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v,
|
||||
dst_stride_v, width, Abs(height), kFilterBilinear);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Copy to I400. Source can be I420,422,444,400,NV12,NV21
|
||||
LIBYUV_API
|
||||
int I400Copy(const uint8_t* src_y,
|
||||
int src_stride_y,
|
||||
uint8_t* dst_y,
|
||||
int dst_stride_y,
|
||||
int width,
|
||||
int height) {
|
||||
if (!src_y || !dst_y || width <= 0 || height == 0) {
|
||||
return -1;
|
||||
}
|
||||
CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Packs planar I422 into YUY2, one source row per output row.
// A negative height writes the destination bottom-up (vertical flip).
// Returns 0 on success, -1 on invalid arguments.
LIBYUV_API
int I422ToYUY2(const uint8_t* src_y,
               int src_stride_y,
               const uint8_t* src_u,
               int src_stride_u,
               const uint8_t* src_v,
               int src_stride_v,
               uint8_t* dst_yuy2,
               int dst_stride_yuy2,
               int width,
               int height) {
  int rows_left;
  // Row kernel; starts as the portable C version and is upgraded below
  // according to the CPU features compiled in and detected at run time.
  void (*row_fn)(const uint8_t* src_y, const uint8_t* src_u,
                 const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
      I422ToYUY2Row_C;

  if (!src_y || !src_u || !src_v || !dst_yuy2) {
    return -1;
  }
  if (width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
    dst_stride_yuy2 = -dst_stride_yuy2;
  }
  // Coalesce rows: when every plane is tightly packed the whole image can be
  // handled as a single long row.
  if (src_stride_y == width && src_stride_u * 2 == width &&
      src_stride_v * 2 == width && dst_stride_yuy2 == width * 2) {
    width *= height;
    height = 1;
    src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
  }
#if defined(HAS_I422TOYUY2ROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    row_fn = IS_ALIGNED(width, 16) ? I422ToYUY2Row_SSE2
                                   : I422ToYUY2Row_Any_SSE2;
  }
#endif
#if defined(HAS_I422TOYUY2ROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    row_fn = IS_ALIGNED(width, 32) ? I422ToYUY2Row_AVX2
                                   : I422ToYUY2Row_Any_AVX2;
  }
#endif
#if defined(HAS_I422TOYUY2ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    row_fn = IS_ALIGNED(width, 16) ? I422ToYUY2Row_NEON
                                   : I422ToYUY2Row_Any_NEON;
  }
#endif

  for (rows_left = height; rows_left > 0; --rows_left) {
    row_fn(src_y, src_u, src_v, dst_yuy2, width);
    src_y += src_stride_y;
    src_u += src_stride_u;
    src_v += src_stride_v;
    dst_yuy2 += dst_stride_yuy2;
  }
  return 0;
}
|
||||
|
||||
// Packs planar I420 into YUY2. Each chroma row is shared by two luma rows;
// an odd final luma row reuses the last chroma row.
// A negative height writes the destination bottom-up (vertical flip).
// Returns 0 on success, -1 on invalid arguments.
LIBYUV_API
int I420ToYUY2(const uint8_t* src_y,
               int src_stride_y,
               const uint8_t* src_u,
               int src_stride_u,
               const uint8_t* src_v,
               int src_stride_v,
               uint8_t* dst_yuy2,
               int dst_stride_yuy2,
               int width,
               int height) {
  int pairs_left;
  // Row kernel; starts as the portable C version and is upgraded below
  // according to the CPU features compiled in and detected at run time.
  void (*row_fn)(const uint8_t* src_y, const uint8_t* src_u,
                 const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
      I422ToYUY2Row_C;

  if (!src_y || !src_u || !src_v || !dst_yuy2) {
    return -1;
  }
  if (width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
    dst_stride_yuy2 = -dst_stride_yuy2;
  }
#if defined(HAS_I422TOYUY2ROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    row_fn = IS_ALIGNED(width, 16) ? I422ToYUY2Row_SSE2
                                   : I422ToYUY2Row_Any_SSE2;
  }
#endif
#if defined(HAS_I422TOYUY2ROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    row_fn = IS_ALIGNED(width, 32) ? I422ToYUY2Row_AVX2
                                   : I422ToYUY2Row_Any_AVX2;
  }
#endif
#if defined(HAS_I422TOYUY2ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    row_fn = IS_ALIGNED(width, 16) ? I422ToYUY2Row_NEON
                                   : I422ToYUY2Row_Any_NEON;
  }
#endif
#if defined(HAS_I422TOYUY2ROW_LSX)
  if (TestCpuFlag(kCpuHasLSX)) {
    row_fn = IS_ALIGNED(width, 16) ? I422ToYUY2Row_LSX
                                   : I422ToYUY2Row_Any_LSX;
  }
#endif
#if defined(HAS_I422TOYUY2ROW_LASX)
  if (TestCpuFlag(kCpuHasLASX)) {
    row_fn = IS_ALIGNED(width, 32) ? I422ToYUY2Row_LASX
                                   : I422ToYUY2Row_Any_LASX;
  }
#endif

  // Two luma rows per chroma row.
  for (pairs_left = height >> 1; pairs_left > 0; --pairs_left) {
    row_fn(src_y, src_u, src_v, dst_yuy2, width);
    row_fn(src_y + src_stride_y, src_u, src_v, dst_yuy2 + dst_stride_yuy2,
           width);
    src_y += src_stride_y * 2;
    src_u += src_stride_u;
    src_v += src_stride_v;
    dst_yuy2 += dst_stride_yuy2 * 2;
  }
  // Odd height: one luma row is left over.
  if (height & 1) {
    row_fn(src_y, src_u, src_v, dst_yuy2, width);
  }
  return 0;
}
|
||||
|
||||
// Packs planar I422 into UYVY, one source row per output row.
// A negative height writes the destination bottom-up (vertical flip).
// Returns 0 on success, -1 on invalid arguments.
LIBYUV_API
int I422ToUYVY(const uint8_t* src_y,
               int src_stride_y,
               const uint8_t* src_u,
               int src_stride_u,
               const uint8_t* src_v,
               int src_stride_v,
               uint8_t* dst_uyvy,
               int dst_stride_uyvy,
               int width,
               int height) {
  int rows_left;
  // Row kernel; starts as the portable C version and is upgraded below
  // according to the CPU features compiled in and detected at run time.
  void (*row_fn)(const uint8_t* src_y, const uint8_t* src_u,
                 const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
      I422ToUYVYRow_C;

  if (!src_y || !src_u || !src_v || !dst_uyvy) {
    return -1;
  }
  if (width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
    dst_stride_uyvy = -dst_stride_uyvy;
  }
  // Coalesce rows: when every plane is tightly packed the whole image can be
  // handled as a single long row.
  if (src_stride_y == width && src_stride_u * 2 == width &&
      src_stride_v * 2 == width && dst_stride_uyvy == width * 2) {
    width *= height;
    height = 1;
    src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
  }
#if defined(HAS_I422TOUYVYROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    row_fn = IS_ALIGNED(width, 16) ? I422ToUYVYRow_SSE2
                                   : I422ToUYVYRow_Any_SSE2;
  }
#endif
#if defined(HAS_I422TOUYVYROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    row_fn = IS_ALIGNED(width, 32) ? I422ToUYVYRow_AVX2
                                   : I422ToUYVYRow_Any_AVX2;
  }
#endif
#if defined(HAS_I422TOUYVYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    row_fn = IS_ALIGNED(width, 16) ? I422ToUYVYRow_NEON
                                   : I422ToUYVYRow_Any_NEON;
  }
#endif
#if defined(HAS_I422TOUYVYROW_LSX)
  if (TestCpuFlag(kCpuHasLSX)) {
    row_fn = IS_ALIGNED(width, 16) ? I422ToUYVYRow_LSX
                                   : I422ToUYVYRow_Any_LSX;
  }
#endif
#if defined(HAS_I422TOUYVYROW_LASX)
  if (TestCpuFlag(kCpuHasLASX)) {
    row_fn = IS_ALIGNED(width, 32) ? I422ToUYVYRow_LASX
                                   : I422ToUYVYRow_Any_LASX;
  }
#endif

  for (rows_left = height; rows_left > 0; --rows_left) {
    row_fn(src_y, src_u, src_v, dst_uyvy, width);
    src_y += src_stride_y;
    src_u += src_stride_u;
    src_v += src_stride_v;
    dst_uyvy += dst_stride_uyvy;
  }
  return 0;
}
|
||||
|
||||
// Packs planar I420 into UYVY. Each chroma row is shared by two luma rows;
// an odd final luma row reuses the last chroma row.
// A negative height writes the destination bottom-up (vertical flip).
// Returns 0 on success, -1 on invalid arguments.
LIBYUV_API
int I420ToUYVY(const uint8_t* src_y,
               int src_stride_y,
               const uint8_t* src_u,
               int src_stride_u,
               const uint8_t* src_v,
               int src_stride_v,
               uint8_t* dst_uyvy,
               int dst_stride_uyvy,
               int width,
               int height) {
  int pairs_left;
  // Row kernel; starts as the portable C version and is upgraded below
  // according to the CPU features compiled in and detected at run time.
  void (*row_fn)(const uint8_t* src_y, const uint8_t* src_u,
                 const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
      I422ToUYVYRow_C;

  if (!src_y || !src_u || !src_v || !dst_uyvy) {
    return -1;
  }
  if (width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
    dst_stride_uyvy = -dst_stride_uyvy;
  }
#if defined(HAS_I422TOUYVYROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    row_fn = IS_ALIGNED(width, 16) ? I422ToUYVYRow_SSE2
                                   : I422ToUYVYRow_Any_SSE2;
  }
#endif
#if defined(HAS_I422TOUYVYROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    row_fn = IS_ALIGNED(width, 32) ? I422ToUYVYRow_AVX2
                                   : I422ToUYVYRow_Any_AVX2;
  }
#endif
#if defined(HAS_I422TOUYVYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    row_fn = IS_ALIGNED(width, 16) ? I422ToUYVYRow_NEON
                                   : I422ToUYVYRow_Any_NEON;
  }
#endif
#if defined(HAS_I422TOUYVYROW_LSX)
  if (TestCpuFlag(kCpuHasLSX)) {
    row_fn = IS_ALIGNED(width, 16) ? I422ToUYVYRow_LSX
                                   : I422ToUYVYRow_Any_LSX;
  }
#endif
#if defined(HAS_I422TOUYVYROW_LASX)
  if (TestCpuFlag(kCpuHasLASX)) {
    row_fn = IS_ALIGNED(width, 32) ? I422ToUYVYRow_LASX
                                   : I422ToUYVYRow_Any_LASX;
  }
#endif

  // Two luma rows per chroma row.
  for (pairs_left = height >> 1; pairs_left > 0; --pairs_left) {
    row_fn(src_y, src_u, src_v, dst_uyvy, width);
    row_fn(src_y + src_stride_y, src_u, src_v, dst_uyvy + dst_stride_uyvy,
           width);
    src_y += src_stride_y * 2;
    src_u += src_stride_u;
    src_v += src_stride_v;
    dst_uyvy += dst_stride_uyvy * 2;
  }
  // Odd height: one luma row is left over.
  if (height & 1) {
    row_fn(src_y, src_u, src_v, dst_uyvy, width);
  }
  return 0;
}
|
||||
|
||||
// Convert I420 to NV12: copy the Y plane and interleave U and V into dst_uv.
// dst_y may be null to skip the luma copy; src_y is only required when dst_y
// is given. A negative height reads the source bottom-up (vertical flip).
// Returns 0 on success, -1 on invalid arguments.
LIBYUV_API
int I420ToNV12(const uint8_t* src_y,
               int src_stride_y,
               const uint8_t* src_u,
               int src_stride_u,
               const uint8_t* src_v,
               int src_stride_v,
               uint8_t* dst_y,
               int dst_stride_y,
               uint8_t* dst_uv,
               int dst_stride_uv,
               int width,
               int height) {
  int chroma_width;
  int chroma_height;
  const int luma_missing = (dst_y != 0) && (src_y == 0);
  if (luma_missing || !src_u || !src_v || !dst_uv || width <= 0 ||
      height == 0) {
    return -1;
  }
  // Chroma planes are half size in both directions, rounded up.
  chroma_width = (width + 1) / 2;
  // Negative height means invert the image: point every source plane at its
  // last row and step backwards.
  if (height < 0) {
    height = -height;
    chroma_height = (height + 1) >> 1;
    src_y = src_y + (height - 1) * src_stride_y;
    src_u = src_u + (chroma_height - 1) * src_stride_u;
    src_v = src_v + (chroma_height - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
  } else {
    chroma_height = (height + 1) / 2;
  }
  if (dst_y) {
    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
  }
  MergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv,
               chroma_width, chroma_height);
  return 0;
}
|
||||
|
||||
// Convert I420 to NV21.
// Implemented by calling I420ToNV12 with the U and V source planes (and their
// strides) exchanged; the return value is passed through unchanged.
LIBYUV_API
int I420ToNV21(const uint8_t* src_y,
               int src_stride_y,
               const uint8_t* src_u,
               int src_stride_u,
               const uint8_t* src_v,
               int src_stride_v,
               uint8_t* dst_y,
               int dst_stride_y,
               uint8_t* dst_vu,
               int dst_stride_vu,
               int width,
               int height) {
  const int status =
      I420ToNV12(src_y, src_stride_y,
                 src_v, src_stride_v,  // V goes where NV12 expects U
                 src_u, src_stride_u,  // U goes where NV12 expects V
                 dst_y, dst_stride_y, dst_vu, dst_stride_vu, width, height);
  return status;
}
|
||||
|
||||
// Convert I420 to specified format
|
||||
LIBYUV_API
|
||||
int ConvertFromI420(const uint8_t* y,
|
||||
int y_stride,
|
||||
const uint8_t* u,
|
||||
int u_stride,
|
||||
const uint8_t* v,
|
||||
int v_stride,
|
||||
uint8_t* dst_sample,
|
||||
int dst_sample_stride,
|
||||
int width,
|
||||
int height,
|
||||
uint32_t fourcc) {
|
||||
uint32_t format = CanonicalFourCC(fourcc);
|
||||
int r = 0;
|
||||
if (!y || !u || !v || !dst_sample || width <= 0 || height == 0) {
|
||||
return -1;
|
||||
}
|
||||
switch (format) {
|
||||
// Single plane formats
|
||||
case FOURCC_YUY2:
|
||||
r = I420ToYUY2(y, y_stride, u, u_stride, v, v_stride, dst_sample,
|
||||
dst_sample_stride ? dst_sample_stride : width * 2, width,
|
||||
height);
|
||||
break;
|
||||
case FOURCC_UYVY:
|
||||
r = I420ToUYVY(y, y_stride, u, u_stride, v, v_stride, dst_sample,
|
||||
dst_sample_stride ? dst_sample_stride : width * 2, width,
|
||||
height);
|
||||
break;
|
||||
case FOURCC_RGBP:
|
||||
r = I420ToRGB565(y, y_stride, u, u_stride, v, v_stride, dst_sample,
|
||||
dst_sample_stride ? dst_sample_stride : width * 2, width,
|
||||
height);
|
||||
break;
|
||||
case FOURCC_RGBO:
|
||||
r = I420ToARGB1555(y, y_stride, u, u_stride, v, v_stride, dst_sample,
|
||||
dst_sample_stride ? dst_sample_stride : width * 2,
|
||||
width, height);
|
||||
break;
|
||||
case FOURCC_R444:
|
||||
r = I420ToARGB4444(y, y_stride, u, u_stride, v, v_stride, dst_sample,
|
||||
dst_sample_stride ? dst_sample_stride : width * 2,
|
||||
width, height);
|
||||
break;
|
||||
case FOURCC_24BG:
|
||||
r = I420ToRGB24(y, y_stride, u, u_stride, v, v_stride, dst_sample,
|
||||
dst_sample_stride ? dst_sample_stride : width * 3, width,
|
||||
height);
|
||||
break;
|
||||
case FOURCC_RAW:
|
||||
r = I420ToRAW(y, y_stride, u, u_stride, v, v_stride, dst_sample,
|
||||
dst_sample_stride ? dst_sample_stride : width * 3, width,
|
||||
height);
|
||||
break;
|
||||
case FOURCC_ARGB:
|
||||
r = I420ToARGB(y, y_stride, u, u_stride, v, v_stride, dst_sample,
|
||||
dst_sample_stride ? dst_sample_stride : width * 4, width,
|
||||
height);
|
||||
break;
|
||||
case FOURCC_BGRA:
|
||||
r = I420ToBGRA(y, y_stride, u, u_stride, v, v_stride, dst_sample,
|
||||
dst_sample_stride ? dst_sample_stride : width * 4, width,
|
||||
height);
|
||||
break;
|
||||
case FOURCC_ABGR:
|
||||
r = I420ToABGR(y, y_stride, u, u_stride, v, v_stride, dst_sample,
|
||||
dst_sample_stride ? dst_sample_stride : width * 4, width,
|
||||
height);
|
||||
break;
|
||||
case FOURCC_RGBA:
|
||||
r = I420ToRGBA(y, y_stride, u, u_stride, v, v_stride, dst_sample,
|
||||
dst_sample_stride ? dst_sample_stride : width * 4, width,
|
||||
height);
|
||||
break;
|
||||
case FOURCC_AR30:
|
||||
r = I420ToAR30(y, y_stride, u, u_stride, v, v_stride, dst_sample,
|
||||
dst_sample_stride ? dst_sample_stride : width * 4, width,
|
||||
height);
|
||||
break;
|
||||
case FOURCC_I400:
|
||||
r = I400Copy(y, y_stride, dst_sample,
|
||||
dst_sample_stride ? dst_sample_stride : width, width,
|
||||
height);
|
||||
break;
|
||||
case FOURCC_NV12: {
|
||||
int dst_y_stride = dst_sample_stride ? dst_sample_stride : width;
|
||||
uint8_t* dst_uv = dst_sample + dst_y_stride * height;
|
||||
r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample,
|
||||
dst_sample_stride ? dst_sample_stride : width, dst_uv,
|
||||
dst_sample_stride ? dst_sample_stride : width, width,
|
||||
height);
|
||||
break;
|
||||
}
|
||||
case FOURCC_NV21: {
|
||||
int dst_y_stride = dst_sample_stride ? dst_sample_stride : width;
|
||||
uint8_t* dst_vu = dst_sample + dst_y_stride * height;
|
||||
r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample,
|
||||
dst_sample_stride ? dst_sample_stride : width, dst_vu,
|
||||
dst_sample_stride ? dst_sample_stride : width, width,
|
||||
height);
|
||||
break;
|
||||
}
|
||||
// Triplanar formats
|
||||
case FOURCC_I420:
|
||||
case FOURCC_YV12: {
|
||||
dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
|
||||
int halfstride = (dst_sample_stride + 1) / 2;
|
||||
int halfheight = (height + 1) / 2;
|
||||
uint8_t* dst_u;
|
||||
uint8_t* dst_v;
|
||||
if (format == FOURCC_YV12) {
|
||||
dst_v = dst_sample + dst_sample_stride * height;
|
||||
dst_u = dst_v + halfstride * halfheight;
|
||||
} else {
|
||||
dst_u = dst_sample + dst_sample_stride * height;
|
||||
dst_v = dst_u + halfstride * halfheight;
|
||||
}
|
||||
r = I420Copy(y, y_stride, u, u_stride, v, v_stride, dst_sample,
|
||||
dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
|
||||
width, height);
|
||||
break;
|
||||
}
|
||||
case FOURCC_I422:
|
||||
case FOURCC_YV16: {
|
||||
dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
|
||||
int halfstride = (dst_sample_stride + 1) / 2;
|
||||
uint8_t* dst_u;
|
||||
uint8_t* dst_v;
|
||||
if (format == FOURCC_YV16) {
|
||||
dst_v = dst_sample + dst_sample_stride * height;
|
||||
dst_u = dst_v + halfstride * height;
|
||||
} else {
|
||||
dst_u = dst_sample + dst_sample_stride * height;
|
||||
dst_v = dst_u + halfstride * height;
|
||||
}
|
||||
r = I420ToI422(y, y_stride, u, u_stride, v, v_stride, dst_sample,
|
||||
dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
|
||||
width, height);
|
||||
break;
|
||||
}
|
||||
case FOURCC_I444:
|
||||
case FOURCC_YV24: {
|
||||
dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
|
||||
uint8_t* dst_u;
|
||||
uint8_t* dst_v;
|
||||
if (format == FOURCC_YV24) {
|
||||
dst_v = dst_sample + dst_sample_stride * height;
|
||||
dst_u = dst_v + dst_sample_stride * height;
|
||||
} else {
|
||||
dst_u = dst_sample + dst_sample_stride * height;
|
||||
dst_v = dst_u + dst_sample_stride * height;
|
||||
}
|
||||
r = I420ToI444(y, y_stride, u, u_stride, v, v_stride, dst_sample,
|
||||
dst_sample_stride, dst_u, dst_sample_stride, dst_v,
|
||||
dst_sample_stride, width, height);
|
||||
break;
|
||||
}
|
||||
// Formats not supported - MJPG, biplanar, some rgb formats.
|
||||
default:
|
||||
return -1; // unknown fourcc - return failure code.
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
4226
3rdparty/libyuv/source/convert_from_argb.cc
vendored
Normal file
4226
3rdparty/libyuv/source/convert_from_argb.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
602
3rdparty/libyuv/source/convert_jpeg.cc
vendored
Normal file
602
3rdparty/libyuv/source/convert_jpeg.cc
vendored
Normal file
@@ -0,0 +1,602 @@
|
||||
/*
|
||||
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/convert.h"
|
||||
#include "libyuv/convert_argb.h"
|
||||
|
||||
#ifdef HAVE_JPEG
|
||||
#include "libyuv/mjpeg_decoder.h"
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_JPEG
|
||||
// Destination cursor shared with the Jpeg*ToI420 row callbacks.
// The callbacks advance y/u/v and decrease h after each band of rows.
// Field order matters: instances are filled by positional aggregate
// initialization.
struct I420Buffers {
  uint8_t* y;    // next Y row to write
  int y_stride;  // bytes between Y rows
  uint8_t* u;    // next U row to write
  int u_stride;  // bytes between U rows
  uint8_t* v;    // next V row to write
  int v_stride;  // bytes between V rows
  int w;         // width passed to the row converters
  int h;         // rows not yet written
};
|
||||
|
||||
static void JpegCopyI420(void* opaque,
|
||||
const uint8_t* const* data,
|
||||
const int* strides,
|
||||
int rows) {
|
||||
I420Buffers* dest = (I420Buffers*)(opaque);
|
||||
I420Copy(data[0], strides[0], data[1], strides[1], data[2], strides[2],
|
||||
dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
|
||||
dest->v_stride, dest->w, rows);
|
||||
dest->y += rows * dest->y_stride;
|
||||
dest->u += ((rows + 1) >> 1) * dest->u_stride;
|
||||
dest->v += ((rows + 1) >> 1) * dest->v_stride;
|
||||
dest->h -= rows;
|
||||
}
|
||||
|
||||
static void JpegI422ToI420(void* opaque,
|
||||
const uint8_t* const* data,
|
||||
const int* strides,
|
||||
int rows) {
|
||||
I420Buffers* dest = (I420Buffers*)(opaque);
|
||||
I422ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2],
|
||||
dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
|
||||
dest->v_stride, dest->w, rows);
|
||||
dest->y += rows * dest->y_stride;
|
||||
dest->u += ((rows + 1) >> 1) * dest->u_stride;
|
||||
dest->v += ((rows + 1) >> 1) * dest->v_stride;
|
||||
dest->h -= rows;
|
||||
}
|
||||
|
||||
static void JpegI444ToI420(void* opaque,
|
||||
const uint8_t* const* data,
|
||||
const int* strides,
|
||||
int rows) {
|
||||
I420Buffers* dest = (I420Buffers*)(opaque);
|
||||
I444ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2],
|
||||
dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
|
||||
dest->v_stride, dest->w, rows);
|
||||
dest->y += rows * dest->y_stride;
|
||||
dest->u += ((rows + 1) >> 1) * dest->u_stride;
|
||||
dest->v += ((rows + 1) >> 1) * dest->v_stride;
|
||||
dest->h -= rows;
|
||||
}
|
||||
|
||||
static void JpegI400ToI420(void* opaque,
|
||||
const uint8_t* const* data,
|
||||
const int* strides,
|
||||
int rows) {
|
||||
I420Buffers* dest = (I420Buffers*)(opaque);
|
||||
I400ToI420(data[0], strides[0], dest->y, dest->y_stride, dest->u,
|
||||
dest->u_stride, dest->v, dest->v_stride, dest->w, rows);
|
||||
dest->y += rows * dest->y_stride;
|
||||
dest->u += ((rows + 1) >> 1) * dest->u_stride;
|
||||
dest->v += ((rows + 1) >> 1) * dest->v_stride;
|
||||
dest->h -= rows;
|
||||
}
|
||||
|
||||
// Query size of MJPG in pixels.
|
||||
LIBYUV_API
|
||||
int MJPGSize(const uint8_t* src_mjpg,
|
||||
size_t src_size_mjpg,
|
||||
int* width,
|
||||
int* height) {
|
||||
MJpegDecoder mjpeg_decoder;
|
||||
LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg);
|
||||
if (ret) {
|
||||
*width = mjpeg_decoder.GetWidth();
|
||||
*height = mjpeg_decoder.GetHeight();
|
||||
}
|
||||
mjpeg_decoder.UnloadFrame();
|
||||
return ret ? 0 : -1; // -1 for runtime failure.
|
||||
}
|
||||
|
||||
// MJPG (Motion JPeg) to I420
|
||||
// TODO(fbarchard): review src_width and src_height requirement. dst_width and
|
||||
// dst_height may be enough.
|
||||
LIBYUV_API
|
||||
int MJPGToI420(const uint8_t* src_mjpg,
|
||||
size_t src_size_mjpg,
|
||||
uint8_t* dst_y,
|
||||
int dst_stride_y,
|
||||
uint8_t* dst_u,
|
||||
int dst_stride_u,
|
||||
uint8_t* dst_v,
|
||||
int dst_stride_v,
|
||||
int src_width,
|
||||
int src_height,
|
||||
int dst_width,
|
||||
int dst_height) {
|
||||
if (src_size_mjpg == kUnknownDataSize) {
|
||||
// ERROR: MJPEG frame size unknown
|
||||
return -1;
|
||||
}
|
||||
|
||||
// TODO(fbarchard): Port MJpeg to C.
|
||||
MJpegDecoder mjpeg_decoder;
|
||||
LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg);
|
||||
if (ret && (mjpeg_decoder.GetWidth() != src_width ||
|
||||
mjpeg_decoder.GetHeight() != src_height)) {
|
||||
// ERROR: MJPEG frame has unexpected dimensions
|
||||
mjpeg_decoder.UnloadFrame();
|
||||
return 1; // runtime failure
|
||||
}
|
||||
if (ret) {
|
||||
I420Buffers bufs = {dst_y, dst_stride_y, dst_u, dst_stride_u,
|
||||
dst_v, dst_stride_v, dst_width, dst_height};
|
||||
// YUV420
|
||||
if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
|
||||
mjpeg_decoder.GetNumComponents() == 3 &&
|
||||
mjpeg_decoder.GetVertSampFactor(0) == 2 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
|
||||
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
|
||||
ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dst_width,
|
||||
dst_height);
|
||||
// YUV422
|
||||
} else if (mjpeg_decoder.GetColorSpace() ==
|
||||
MJpegDecoder::kColorSpaceYCbCr &&
|
||||
mjpeg_decoder.GetNumComponents() == 3 &&
|
||||
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
|
||||
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
|
||||
ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dst_width,
|
||||
dst_height);
|
||||
// YUV444
|
||||
} else if (mjpeg_decoder.GetColorSpace() ==
|
||||
MJpegDecoder::kColorSpaceYCbCr &&
|
||||
mjpeg_decoder.GetNumComponents() == 3 &&
|
||||
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
|
||||
ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dst_width,
|
||||
dst_height);
|
||||
// YUV400
|
||||
} else if (mjpeg_decoder.GetColorSpace() ==
|
||||
MJpegDecoder::kColorSpaceGrayscale &&
|
||||
mjpeg_decoder.GetNumComponents() == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(0) == 1) {
|
||||
ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dst_width,
|
||||
dst_height);
|
||||
} else {
|
||||
// TODO(fbarchard): Implement conversion for any other
|
||||
// colorspace/subsample factors that occur in practice. ERROR: Unable to
|
||||
// convert MJPEG frame because format is not supported
|
||||
mjpeg_decoder.UnloadFrame();
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return ret ? 0 : 1;
|
||||
}
|
||||
|
||||
// Destination cursor shared with the Jpeg*ToNV21 and Jpeg*ToNV12 row
// callbacks. The callbacks advance y/vu and decrease h after each band of
// rows. Field order matters: instances are filled by positional aggregate
// initialization.
struct NV21Buffers {
  uint8_t* y;     // next Y row to write
  int y_stride;   // bytes between Y rows
  uint8_t* vu;    // next interleaved chroma row to write
  int vu_stride;  // bytes between interleaved chroma rows
  int w;          // width passed to the row converters
  int h;          // rows not yet written
};
|
||||
|
||||
static void JpegI420ToNV21(void* opaque,
|
||||
const uint8_t* const* data,
|
||||
const int* strides,
|
||||
int rows) {
|
||||
NV21Buffers* dest = (NV21Buffers*)(opaque);
|
||||
I420ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2],
|
||||
dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
|
||||
dest->y += rows * dest->y_stride;
|
||||
dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
|
||||
dest->h -= rows;
|
||||
}
|
||||
|
||||
static void JpegI422ToNV21(void* opaque,
|
||||
const uint8_t* const* data,
|
||||
const int* strides,
|
||||
int rows) {
|
||||
NV21Buffers* dest = (NV21Buffers*)(opaque);
|
||||
I422ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2],
|
||||
dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
|
||||
dest->y += rows * dest->y_stride;
|
||||
dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
|
||||
dest->h -= rows;
|
||||
}
|
||||
|
||||
static void JpegI444ToNV21(void* opaque,
|
||||
const uint8_t* const* data,
|
||||
const int* strides,
|
||||
int rows) {
|
||||
NV21Buffers* dest = (NV21Buffers*)(opaque);
|
||||
I444ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2],
|
||||
dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
|
||||
dest->y += rows * dest->y_stride;
|
||||
dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
|
||||
dest->h -= rows;
|
||||
}
|
||||
|
||||
static void JpegI400ToNV21(void* opaque,
|
||||
const uint8_t* const* data,
|
||||
const int* strides,
|
||||
int rows) {
|
||||
NV21Buffers* dest = (NV21Buffers*)(opaque);
|
||||
I400ToNV21(data[0], strides[0], dest->y, dest->y_stride, dest->vu,
|
||||
dest->vu_stride, dest->w, rows);
|
||||
dest->y += rows * dest->y_stride;
|
||||
dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
|
||||
dest->h -= rows;
|
||||
}
|
||||
|
||||
// MJPG (Motion JPeg) to NV21
|
||||
LIBYUV_API
|
||||
int MJPGToNV21(const uint8_t* src_mjpg,
|
||||
size_t src_size_mjpg,
|
||||
uint8_t* dst_y,
|
||||
int dst_stride_y,
|
||||
uint8_t* dst_vu,
|
||||
int dst_stride_vu,
|
||||
int src_width,
|
||||
int src_height,
|
||||
int dst_width,
|
||||
int dst_height) {
|
||||
if (src_size_mjpg == kUnknownDataSize) {
|
||||
// ERROR: MJPEG frame size unknown
|
||||
return -1;
|
||||
}
|
||||
|
||||
// TODO(fbarchard): Port MJpeg to C.
|
||||
MJpegDecoder mjpeg_decoder;
|
||||
LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg);
|
||||
if (ret && (mjpeg_decoder.GetWidth() != src_width ||
|
||||
mjpeg_decoder.GetHeight() != src_height)) {
|
||||
// ERROR: MJPEG frame has unexpected dimensions
|
||||
mjpeg_decoder.UnloadFrame();
|
||||
return 1; // runtime failure
|
||||
}
|
||||
if (ret) {
|
||||
NV21Buffers bufs = {dst_y, dst_stride_y, dst_vu,
|
||||
dst_stride_vu, dst_width, dst_height};
|
||||
// YUV420
|
||||
if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
|
||||
mjpeg_decoder.GetNumComponents() == 3 &&
|
||||
mjpeg_decoder.GetVertSampFactor(0) == 2 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
|
||||
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
|
||||
ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToNV21, &bufs, dst_width,
|
||||
dst_height);
|
||||
// YUV422
|
||||
} else if (mjpeg_decoder.GetColorSpace() ==
|
||||
MJpegDecoder::kColorSpaceYCbCr &&
|
||||
mjpeg_decoder.GetNumComponents() == 3 &&
|
||||
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
|
||||
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
|
||||
ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToNV21, &bufs, dst_width,
|
||||
dst_height);
|
||||
// YUV444
|
||||
} else if (mjpeg_decoder.GetColorSpace() ==
|
||||
MJpegDecoder::kColorSpaceYCbCr &&
|
||||
mjpeg_decoder.GetNumComponents() == 3 &&
|
||||
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
|
||||
ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToNV21, &bufs, dst_width,
|
||||
dst_height);
|
||||
// YUV400
|
||||
} else if (mjpeg_decoder.GetColorSpace() ==
|
||||
MJpegDecoder::kColorSpaceGrayscale &&
|
||||
mjpeg_decoder.GetNumComponents() == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(0) == 1) {
|
||||
ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToNV21, &bufs, dst_width,
|
||||
dst_height);
|
||||
} else {
|
||||
// Unknown colorspace.
|
||||
mjpeg_decoder.UnloadFrame();
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return ret ? 0 : 1;
|
||||
}
|
||||
|
||||
static void JpegI420ToNV12(void* opaque,
|
||||
const uint8_t* const* data,
|
||||
const int* strides,
|
||||
int rows) {
|
||||
NV21Buffers* dest = (NV21Buffers*)(opaque);
|
||||
// Use NV21 with VU swapped.
|
||||
I420ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
|
||||
dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
|
||||
dest->y += rows * dest->y_stride;
|
||||
dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
|
||||
dest->h -= rows;
|
||||
}
|
||||
|
||||
static void JpegI422ToNV12(void* opaque,
|
||||
const uint8_t* const* data,
|
||||
const int* strides,
|
||||
int rows) {
|
||||
NV21Buffers* dest = (NV21Buffers*)(opaque);
|
||||
// Use NV21 with VU swapped.
|
||||
I422ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
|
||||
dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
|
||||
dest->y += rows * dest->y_stride;
|
||||
dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
|
||||
dest->h -= rows;
|
||||
}
|
||||
|
||||
static void JpegI444ToNV12(void* opaque,
|
||||
const uint8_t* const* data,
|
||||
const int* strides,
|
||||
int rows) {
|
||||
NV21Buffers* dest = (NV21Buffers*)(opaque);
|
||||
// Use NV21 with VU swapped.
|
||||
I444ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
|
||||
dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
|
||||
dest->y += rows * dest->y_stride;
|
||||
dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
|
||||
dest->h -= rows;
|
||||
}
|
||||
|
||||
static void JpegI400ToNV12(void* opaque,
|
||||
const uint8_t* const* data,
|
||||
const int* strides,
|
||||
int rows) {
|
||||
NV21Buffers* dest = (NV21Buffers*)(opaque);
|
||||
// Use NV21 since there is no UV plane.
|
||||
I400ToNV21(data[0], strides[0], dest->y, dest->y_stride, dest->vu,
|
||||
dest->vu_stride, dest->w, rows);
|
||||
dest->y += rows * dest->y_stride;
|
||||
dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
|
||||
dest->h -= rows;
|
||||
}
|
||||
|
||||
// MJPG (Motion JPEG) to NV12.
|
||||
LIBYUV_API
|
||||
int MJPGToNV12(const uint8_t* sample,
|
||||
size_t sample_size,
|
||||
uint8_t* dst_y,
|
||||
int dst_stride_y,
|
||||
uint8_t* dst_uv,
|
||||
int dst_stride_uv,
|
||||
int src_width,
|
||||
int src_height,
|
||||
int dst_width,
|
||||
int dst_height) {
|
||||
if (sample_size == kUnknownDataSize) {
|
||||
// ERROR: MJPEG frame size unknown
|
||||
return -1;
|
||||
}
|
||||
|
||||
// TODO(fbarchard): Port MJpeg to C.
|
||||
MJpegDecoder mjpeg_decoder;
|
||||
LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
|
||||
if (ret && (mjpeg_decoder.GetWidth() != src_width ||
|
||||
mjpeg_decoder.GetHeight() != src_height)) {
|
||||
// ERROR: MJPEG frame has unexpected dimensions
|
||||
mjpeg_decoder.UnloadFrame();
|
||||
return 1; // runtime failure
|
||||
}
|
||||
if (ret) {
|
||||
// Use NV21Buffers but with UV instead of VU.
|
||||
NV21Buffers bufs = {dst_y, dst_stride_y, dst_uv,
|
||||
dst_stride_uv, dst_width, dst_height};
|
||||
// YUV420
|
||||
if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
|
||||
mjpeg_decoder.GetNumComponents() == 3 &&
|
||||
mjpeg_decoder.GetVertSampFactor(0) == 2 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
|
||||
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
|
||||
ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToNV12, &bufs, dst_width,
|
||||
dst_height);
|
||||
// YUV422
|
||||
} else if (mjpeg_decoder.GetColorSpace() ==
|
||||
MJpegDecoder::kColorSpaceYCbCr &&
|
||||
mjpeg_decoder.GetNumComponents() == 3 &&
|
||||
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
|
||||
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
|
||||
ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToNV12, &bufs, dst_width,
|
||||
dst_height);
|
||||
// YUV444
|
||||
} else if (mjpeg_decoder.GetColorSpace() ==
|
||||
MJpegDecoder::kColorSpaceYCbCr &&
|
||||
mjpeg_decoder.GetNumComponents() == 3 &&
|
||||
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
|
||||
ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToNV12, &bufs, dst_width,
|
||||
dst_height);
|
||||
// YUV400
|
||||
} else if (mjpeg_decoder.GetColorSpace() ==
|
||||
MJpegDecoder::kColorSpaceGrayscale &&
|
||||
mjpeg_decoder.GetNumComponents() == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(0) == 1) {
|
||||
ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToNV12, &bufs, dst_width,
|
||||
dst_height);
|
||||
} else {
|
||||
// Unknown colorspace.
|
||||
mjpeg_decoder.UnloadFrame();
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return ret ? 0 : 1;
|
||||
}
|
||||
|
||||
// Destination cursor shared with the Jpeg*ToARGB row callbacks.
// The callbacks advance argb and decrease h after each band of rows.
// Field order matters: instances are filled by positional aggregate
// initialization.
struct ARGBBuffers {
  uint8_t* argb;    // next ARGB row to write
  int argb_stride;  // bytes between ARGB rows
  int w;            // width passed to the row converters
  int h;            // rows not yet written
};
|
||||
|
||||
static void JpegI420ToARGB(void* opaque,
|
||||
const uint8_t* const* data,
|
||||
const int* strides,
|
||||
int rows) {
|
||||
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
|
||||
I420ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
|
||||
dest->argb, dest->argb_stride, dest->w, rows);
|
||||
dest->argb += rows * dest->argb_stride;
|
||||
dest->h -= rows;
|
||||
}
|
||||
|
||||
static void JpegI422ToARGB(void* opaque,
|
||||
const uint8_t* const* data,
|
||||
const int* strides,
|
||||
int rows) {
|
||||
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
|
||||
I422ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
|
||||
dest->argb, dest->argb_stride, dest->w, rows);
|
||||
dest->argb += rows * dest->argb_stride;
|
||||
dest->h -= rows;
|
||||
}
|
||||
|
||||
static void JpegI444ToARGB(void* opaque,
|
||||
const uint8_t* const* data,
|
||||
const int* strides,
|
||||
int rows) {
|
||||
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
|
||||
I444ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
|
||||
dest->argb, dest->argb_stride, dest->w, rows);
|
||||
dest->argb += rows * dest->argb_stride;
|
||||
dest->h -= rows;
|
||||
}
|
||||
|
||||
static void JpegI400ToARGB(void* opaque,
|
||||
const uint8_t* const* data,
|
||||
const int* strides,
|
||||
int rows) {
|
||||
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
|
||||
I400ToARGB(data[0], strides[0], dest->argb, dest->argb_stride, dest->w, rows);
|
||||
dest->argb += rows * dest->argb_stride;
|
||||
dest->h -= rows;
|
||||
}
|
||||
|
||||
// MJPG (Motion JPeg) to ARGB
|
||||
// TODO(fbarchard): review src_width and src_height requirement. dst_width and
|
||||
// dst_height may be enough.
|
||||
LIBYUV_API
|
||||
int MJPGToARGB(const uint8_t* src_mjpg,
|
||||
size_t src_size_mjpg,
|
||||
uint8_t* dst_argb,
|
||||
int dst_stride_argb,
|
||||
int src_width,
|
||||
int src_height,
|
||||
int dst_width,
|
||||
int dst_height) {
|
||||
if (src_size_mjpg == kUnknownDataSize) {
|
||||
// ERROR: MJPEG frame size unknown
|
||||
return -1;
|
||||
}
|
||||
|
||||
// TODO(fbarchard): Port MJpeg to C.
|
||||
MJpegDecoder mjpeg_decoder;
|
||||
LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg);
|
||||
if (ret && (mjpeg_decoder.GetWidth() != src_width ||
|
||||
mjpeg_decoder.GetHeight() != src_height)) {
|
||||
// ERROR: MJPEG frame has unexpected dimensions
|
||||
mjpeg_decoder.UnloadFrame();
|
||||
return 1; // runtime failure
|
||||
}
|
||||
if (ret) {
|
||||
ARGBBuffers bufs = {dst_argb, dst_stride_argb, dst_width, dst_height};
|
||||
// YUV420
|
||||
if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
|
||||
mjpeg_decoder.GetNumComponents() == 3 &&
|
||||
mjpeg_decoder.GetVertSampFactor(0) == 2 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
|
||||
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
|
||||
ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dst_width,
|
||||
dst_height);
|
||||
// YUV422
|
||||
} else if (mjpeg_decoder.GetColorSpace() ==
|
||||
MJpegDecoder::kColorSpaceYCbCr &&
|
||||
mjpeg_decoder.GetNumComponents() == 3 &&
|
||||
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
|
||||
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
|
||||
ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dst_width,
|
||||
dst_height);
|
||||
// YUV444
|
||||
} else if (mjpeg_decoder.GetColorSpace() ==
|
||||
MJpegDecoder::kColorSpaceYCbCr &&
|
||||
mjpeg_decoder.GetNumComponents() == 3 &&
|
||||
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
|
||||
ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dst_width,
|
||||
dst_height);
|
||||
// YUV400
|
||||
} else if (mjpeg_decoder.GetColorSpace() ==
|
||||
MJpegDecoder::kColorSpaceGrayscale &&
|
||||
mjpeg_decoder.GetNumComponents() == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(0) == 1) {
|
||||
ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dst_width,
|
||||
dst_height);
|
||||
} else {
|
||||
// TODO(fbarchard): Implement conversion for any other
|
||||
// colorspace/subsample factors that occur in practice. ERROR: Unable to
|
||||
// convert MJPEG frame because format is not supported
|
||||
mjpeg_decoder.UnloadFrame();
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return ret ? 0 : 1;
|
||||
}
|
||||
|
||||
#endif // HAVE_JPEG
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
391
3rdparty/libyuv/source/convert_to_argb.cc
vendored
Normal file
391
3rdparty/libyuv/source/convert_to_argb.cc
vendored
Normal file
@@ -0,0 +1,391 @@
|
||||
/*
|
||||
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/convert_argb.h"
|
||||
|
||||
#include <limits.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "libyuv/cpu_id.h"
|
||||
#ifdef HAVE_JPEG
|
||||
#include "libyuv/mjpeg_decoder.h"
|
||||
#endif
|
||||
#include "libyuv/rotate_argb.h"
|
||||
#include "libyuv/row.h"
|
||||
#include "libyuv/video_common.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Convert camera sample to ARGB with cropping, rotation and vertical flip.
|
||||
// src_width is used for source stride computation
|
||||
// src_height is used to compute location of planes, and indicate inversion
|
||||
// sample_size is measured in bytes and is the size of the frame.
|
||||
// With MJPEG it is the compressed size of the frame.
|
||||
|
||||
// TODO(fbarchard): Add the following:
|
||||
// H010ToARGB
|
||||
// I010ToARGB
|
||||
|
||||
// Converts one frame in the format named by |fourcc| to ARGB, applying the
// crop rectangle (crop_x, crop_y, crop_width, crop_height) and |rotation|.
//
// sample / sample_size  source frame; sample_size is only forwarded to the
//                       MJPG decoder.
// dst_argb / dst_stride_argb  destination ARGB image.
// src_width             source width, used to derive source strides.
// src_height            source height, used to locate planes; a negative
//                       value requests a vertically flipped conversion.
// crop_height           only its magnitude is used.
//
// Returns -1 for invalid arguments, an oversized temporary buffer or an
// unknown fourcc, 1 when the temporary buffer cannot be allocated, and
// otherwise the status of the underlying converter (0 is treated as success).
LIBYUV_API
int ConvertToARGB(const uint8_t* sample,
                  size_t sample_size,
                  uint8_t* dst_argb,
                  int dst_stride_argb,
                  int crop_x,
                  int crop_y,
                  int src_width,
                  int src_height,
                  int crop_width,
                  int crop_height,
                  enum RotationMode rotation,
                  uint32_t fourcc) {
  uint32_t format = CanonicalFourCC(fourcc);
  int aligned_src_width = (src_width + 1) & ~1;
  const uint8_t* src;
  const uint8_t* src_uv;
  int abs_src_height = (src_height < 0) ? -src_height : src_height;
  int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
  int r = 0;

  // One pass rotation is available for some formats. For the rest, convert
  // to ARGB (with optional vertical flipping) into a temporary ARGB buffer,
  // and then rotate the ARGB to the final destination buffer.
  // For in-place conversion, if destination dst_argb is same as source sample,
  // also enable temporary buffer.
  LIBYUV_BOOL need_buf =
      (rotation && format != FOURCC_ARGB) || dst_argb == sample;
  uint8_t* dest_argb = dst_argb;
  int dest_dst_stride_argb = dst_stride_argb;
  uint8_t* rotate_buffer = NULL;
  int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;

  if (dst_argb == NULL || sample == NULL || src_width <= 0 ||
      src_width > INT_MAX / 4 || crop_width <= 0 || crop_width > INT_MAX / 4 ||
      src_height == 0 || crop_height == 0) {
    return -1;
  }
  // A negative source height means the image is stored bottom-up; pass a
  // negative height to the converters so they flip while converting.
  if (src_height < 0) {
    inv_crop_height = -inv_crop_height;
  }

  if (need_buf) {
    const uint64_t rotate_buffer_size =
        (uint64_t)crop_width * 4 * abs_crop_height;
    if (rotate_buffer_size > SIZE_MAX) {
      return -1;  // Invalid size.
    }
    rotate_buffer = (uint8_t*)malloc((size_t)rotate_buffer_size);
    if (!rotate_buffer) {
      return 1;  // Out of memory runtime error.
    }
    // From here on the converters write into the temporary buffer; the real
    // destination is kept in dest_argb / dest_dst_stride_argb.
    dst_argb = rotate_buffer;
    dst_stride_argb = crop_width * 4;
  }

  switch (format) {
    // Single plane formats
    case FOURCC_YUY2:
      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
      r = YUY2ToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb,
                     crop_width, inv_crop_height);
      break;
    case FOURCC_UYVY:
      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
      r = UYVYToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb,
                     crop_width, inv_crop_height);
      break;
    case FOURCC_24BG:
      src = sample + (src_width * crop_y + crop_x) * 3;
      r = RGB24ToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width,
                      inv_crop_height);
      break;
    case FOURCC_RAW:
      src = sample + (src_width * crop_y + crop_x) * 3;
      r = RAWToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width,
                    inv_crop_height);
      break;
    case FOURCC_ARGB:
      // Without rotation this is a plain cropped copy. When the temporary
      // buffer is in use (in-place conversion) it must be filled here,
      // otherwise the ARGBRotate after the switch would read uninitialized
      // memory. Rotation without a buffer is done in one pass after the switch.
      if (need_buf || !rotation) {
        src = sample + (src_width * crop_y + crop_x) * 4;
        r = ARGBToARGB(src, src_width * 4, dst_argb, dst_stride_argb,
                       crop_width, inv_crop_height);
      }
      break;
    case FOURCC_BGRA:
      src = sample + (src_width * crop_y + crop_x) * 4;
      r = BGRAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
                     inv_crop_height);
      break;
    case FOURCC_ABGR:
      src = sample + (src_width * crop_y + crop_x) * 4;
      r = ABGRToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
                     inv_crop_height);
      break;
    case FOURCC_RGBA:
      src = sample + (src_width * crop_y + crop_x) * 4;
      r = RGBAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
                     inv_crop_height);
      break;
    case FOURCC_AR30:
      src = sample + (src_width * crop_y + crop_x) * 4;
      r = AR30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
                     inv_crop_height);
      break;
    case FOURCC_AB30:
      src = sample + (src_width * crop_y + crop_x) * 4;
      r = AB30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
                     inv_crop_height);
      break;
    case FOURCC_RGBP:
      src = sample + (src_width * crop_y + crop_x) * 2;
      r = RGB565ToARGB(src, src_width * 2, dst_argb, dst_stride_argb,
                       crop_width, inv_crop_height);
      break;
    case FOURCC_RGBO:
      src = sample + (src_width * crop_y + crop_x) * 2;
      r = ARGB1555ToARGB(src, src_width * 2, dst_argb, dst_stride_argb,
                         crop_width, inv_crop_height);
      break;
    case FOURCC_R444:
      src = sample + (src_width * crop_y + crop_x) * 2;
      r = ARGB4444ToARGB(src, src_width * 2, dst_argb, dst_stride_argb,
                         crop_width, inv_crop_height);
      break;
    case FOURCC_I400:
      src = sample + src_width * crop_y + crop_x;
      r = I400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
                     inv_crop_height);
      break;
    case FOURCC_J400:
      src = sample + src_width * crop_y + crop_x;
      r = J400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
                     inv_crop_height);
      break;

    // Biplanar formats
    case FOURCC_NV12:
    case FOURCC_NV21:
      src = sample + (src_width * crop_y + crop_x);
      // The Y plane is read with a stride of src_width, so the interleaved
      // chroma plane starts src_width * abs_src_height bytes in. The column
      // offset is rounded down to an even byte so it always lands on the
      // first byte of a chroma pair (same layout as ConvertToI420 uses).
      src_uv = sample + src_width * abs_src_height +
               aligned_src_width * (crop_y / 2) + (crop_x / 2) * 2;
      if (format == FOURCC_NV21) {
        r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb,
                       dst_stride_argb, crop_width, inv_crop_height);
      } else {
        r = NV12ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb,
                       dst_stride_argb, crop_width, inv_crop_height);
      }
      break;

    // Triplanar formats, 4:2:0 (chroma halved in both directions).
    case FOURCC_I420:
    case FOURCC_YV12:
    case FOURCC_J420:
    case FOURCC_H420:
    case FOURCC_U420: {
      const int halfwidth = (src_width + 1) / 2;
      const int halfheight = (abs_src_height + 1) / 2;
      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
      // Chroma row for luma row crop_y is crop_y / 2; dividing the row and
      // column separately keeps the offset correct for odd crop_y.
      const uint8_t* first_plane = sample + src_width * abs_src_height +
                                   halfwidth * (crop_y / 2) + crop_x / 2;
      const uint8_t* second_plane = sample + src_width * abs_src_height +
                                    halfwidth * (halfheight + crop_y / 2) +
                                    crop_x / 2;
      // YV12 stores V before U; every other format here stores U first.
      const uint8_t* src_u =
          (format == FOURCC_YV12) ? second_plane : first_plane;
      const uint8_t* src_v =
          (format == FOURCC_YV12) ? first_plane : second_plane;
      if (format == FOURCC_J420) {
        r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                       dst_argb, dst_stride_argb, crop_width, inv_crop_height);
      } else if (format == FOURCC_H420) {
        r = H420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                       dst_argb, dst_stride_argb, crop_width, inv_crop_height);
      } else if (format == FOURCC_U420) {
        r = U420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                       dst_argb, dst_stride_argb, crop_width, inv_crop_height);
      } else {
        r = I420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                       dst_argb, dst_stride_argb, crop_width, inv_crop_height);
      }
      break;
    }

    // Triplanar formats, 4:2:2 (chroma halved horizontally only).
    case FOURCC_I422:
    case FOURCC_YV16:
    case FOURCC_J422:
    case FOURCC_H422:
    case FOURCC_U422: {
      const int halfwidth = (src_width + 1) / 2;
      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
      const uint8_t* first_plane = sample + src_width * abs_src_height +
                                   halfwidth * crop_y + crop_x / 2;
      const uint8_t* second_plane = sample + src_width * abs_src_height +
                                    halfwidth * (abs_src_height + crop_y) +
                                    crop_x / 2;
      // YV16 stores V before U.
      const uint8_t* src_u =
          (format == FOURCC_YV16) ? second_plane : first_plane;
      const uint8_t* src_v =
          (format == FOURCC_YV16) ? first_plane : second_plane;
      if (format == FOURCC_J422) {
        r = J422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                       dst_argb, dst_stride_argb, crop_width, inv_crop_height);
      } else if (format == FOURCC_H422) {
        r = H422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                       dst_argb, dst_stride_argb, crop_width, inv_crop_height);
      } else if (format == FOURCC_U422) {
        // U422 needs its own converter, matching U420 and U444 here.
        r = U422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                       dst_argb, dst_stride_argb, crop_width, inv_crop_height);
      } else {
        r = I422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                       dst_argb, dst_stride_argb, crop_width, inv_crop_height);
      }
      break;
    }

    // Triplanar formats, 4:4:4 (full resolution chroma).
    case FOURCC_I444:
    case FOURCC_YV24:
    case FOURCC_J444:
    case FOURCC_H444:
    case FOURCC_U444: {
      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
      const uint8_t* first_plane =
          sample + src_width * (abs_src_height + crop_y) + crop_x;
      const uint8_t* second_plane =
          sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
      // YV24 stores V before U.
      const uint8_t* src_u =
          (format == FOURCC_YV24) ? second_plane : first_plane;
      const uint8_t* src_v =
          (format == FOURCC_YV24) ? first_plane : second_plane;
      if (format == FOURCC_J444) {
        r = J444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
                       dst_argb, dst_stride_argb, crop_width, inv_crop_height);
      } else if (format == FOURCC_H444) {
        r = H444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
                       dst_argb, dst_stride_argb, crop_width, inv_crop_height);
      } else if (format == FOURCC_U444) {
        r = U444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
                       dst_argb, dst_stride_argb, crop_width, inv_crop_height);
      } else {
        r = I444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
                       dst_argb, dst_stride_argb, crop_width, inv_crop_height);
      }
      break;
    }

#ifdef HAVE_JPEG
    case FOURCC_MJPG:
      r = MJPGToARGB(sample, sample_size, dst_argb, dst_stride_argb, src_width,
                     abs_src_height, crop_width, inv_crop_height);
      break;
#endif
    default:
      r = -1;  // unknown fourcc - return failure code.
  }

  if (need_buf) {
    // Second pass: rotate (or copy, when rotation is 0) from the temporary
    // buffer into the caller's destination. Skipped if conversion failed.
    if (!r) {
      r = ARGBRotate(dst_argb, dst_stride_argb, dest_argb, dest_dst_stride_argb,
                     crop_width, abs_crop_height, rotation);
    }
    free(rotate_buffer);
  } else if (rotation) {
    // Only reachable for ARGB input: rotate straight from the source.
    src = sample + (src_width * crop_y + crop_x) * 4;
    r = ARGBRotate(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
                   inv_crop_height, rotation);
  }

  return r;
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
288
3rdparty/libyuv/source/convert_to_i420.cc
vendored
Normal file
288
3rdparty/libyuv/source/convert_to_i420.cc
vendored
Normal file
@@ -0,0 +1,288 @@
|
||||
/*
|
||||
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/convert.h"
|
||||
|
||||
#include <limits.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "libyuv/video_common.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Convert camera sample to I420 with cropping, rotation and vertical flip.
|
||||
// src_width is used for source stride computation
|
||||
// src_height is used to compute location of planes, and indicate inversion
|
||||
// sample_size is measured in bytes and is the size of the frame.
|
||||
// With MJPEG it is the compressed size of the frame.
|
||||
// Converts one frame in the format named by |fourcc| to I420, applying the
// crop rectangle (crop_x, crop_y, crop_width, crop_height) and |rotation|.
//
// sample / sample_size  source frame; sample_size is only forwarded to the
//                       MJPG decoder.
// dst_y/u/v and strides destination I420 planes.
// src_width             source width, used to derive source strides.
// src_height            source height, used to locate planes; a negative
//                       value requests a vertically flipped conversion.
// crop_height           only its magnitude is used.
//
// Returns -1 for invalid arguments, an oversized temporary buffer or an
// unknown fourcc, 1 when the temporary buffer cannot be allocated, and
// otherwise the status of the underlying converter (0 is treated as success).
LIBYUV_API
int ConvertToI420(const uint8_t* sample,
                  size_t sample_size,
                  uint8_t* dst_y,
                  int dst_stride_y,
                  uint8_t* dst_u,
                  int dst_stride_u,
                  uint8_t* dst_v,
                  int dst_stride_v,
                  int crop_x,
                  int crop_y,
                  int src_width,
                  int src_height,
                  int crop_width,
                  int crop_height,
                  enum RotationMode rotation,
                  uint32_t fourcc) {
  uint32_t format = CanonicalFourCC(fourcc);
  int aligned_src_width = (src_width + 1) & ~1;
  const uint8_t* src;
  const uint8_t* src_uv;
  const int abs_src_height = (src_height < 0) ? -src_height : src_height;
  const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
  int r = 0;
  LIBYUV_BOOL need_buf =
      (rotation && format != FOURCC_I420 && format != FOURCC_NV12 &&
       format != FOURCC_NV21 && format != FOURCC_YV12) ||
      dst_y == sample;
  // The formats with a one-pass rotating converter must not rotate while
  // filling the temporary buffer: the buffer is laid out for an unrotated
  // crop and the final I420Rotate applies |rotation| itself.
  const enum RotationMode first_pass_rotation = need_buf ? kRotate0 : rotation;
  uint8_t* tmp_y = dst_y;
  uint8_t* tmp_u = dst_u;
  uint8_t* tmp_v = dst_v;
  int tmp_y_stride = dst_stride_y;
  int tmp_u_stride = dst_stride_u;
  int tmp_v_stride = dst_stride_v;
  uint8_t* rotate_buffer = NULL;
  // Negative height tells the converters to flip vertically.
  const int inv_crop_height =
      (src_height < 0) ? -abs_crop_height : abs_crop_height;

  if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 ||
      src_width > INT_MAX / 4 || crop_width <= 0 || src_height == 0 ||
      crop_height == 0) {
    return -1;
  }

  // One pass rotation is available for some formats. For the rest, convert
  // to I420 (with optional vertical flipping) into a temporary I420 buffer,
  // and then rotate the I420 to the final destination buffer.
  // For in-place conversion, if destination dst_y is same as source sample,
  // also enable temporary buffer.
  if (need_buf) {
    // Rounded-up halves written without "+ 1" so they cannot overflow int.
    const int half_crop_width = crop_width / 2 + (crop_width & 1);
    const int half_crop_height = abs_crop_height / 2 + (abs_crop_height & 1);
    // Plane sizes are computed in 64 bits: crop_width has no upper bound
    // check, so an int product could overflow.
    const uint64_t y_size = (uint64_t)crop_width * (uint64_t)abs_crop_height;
    const uint64_t uv_size =
        (uint64_t)half_crop_width * (uint64_t)half_crop_height;
    const uint64_t rotate_buffer_size = y_size + uv_size * 2;
    if (rotate_buffer_size > SIZE_MAX) {
      return -1;  // Invalid size.
    }
    rotate_buffer = (uint8_t*)malloc((size_t)rotate_buffer_size);
    if (!rotate_buffer) {
      return 1;  // Out of memory runtime error.
    }
    // Converters now write tightly packed Y, U, V planes into the temporary
    // buffer; the caller's planes are kept in tmp_y / tmp_u / tmp_v.
    dst_y = rotate_buffer;
    dst_u = dst_y + (size_t)y_size;
    dst_v = dst_u + (size_t)uv_size;
    dst_stride_y = crop_width;
    dst_stride_u = dst_stride_v = half_crop_width;
  }

  switch (format) {
    // Single plane formats
    case FOURCC_YUY2: {  // TODO(fbarchard): Find better odd crop fix.
      // An odd crop_x starts on the second pixel of a pair, which swaps the
      // position of U and V in the stream; swap the destinations to match.
      uint8_t* u = (crop_x & 1) ? dst_v : dst_u;
      uint8_t* v = (crop_x & 1) ? dst_u : dst_v;
      int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u;
      int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v;
      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
      r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u,
                     stride_u, v, stride_v, crop_width, inv_crop_height);
      break;
    }
    case FOURCC_UYVY: {
      // Same odd crop_x handling as YUY2.
      uint8_t* u = (crop_x & 1) ? dst_v : dst_u;
      uint8_t* v = (crop_x & 1) ? dst_u : dst_v;
      int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u;
      int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v;
      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
      r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u,
                     stride_u, v, stride_v, crop_width, inv_crop_height);
      break;
    }
    case FOURCC_RGBP:
      src = sample + (src_width * crop_y + crop_x) * 2;
      r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
                       dst_stride_u, dst_v, dst_stride_v, crop_width,
                       inv_crop_height);
      break;
    case FOURCC_RGBO:
      src = sample + (src_width * crop_y + crop_x) * 2;
      r = ARGB1555ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
                         dst_stride_u, dst_v, dst_stride_v, crop_width,
                         inv_crop_height);
      break;
    case FOURCC_R444:
      src = sample + (src_width * crop_y + crop_x) * 2;
      r = ARGB4444ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
                         dst_stride_u, dst_v, dst_stride_v, crop_width,
                         inv_crop_height);
      break;
    case FOURCC_24BG:
      src = sample + (src_width * crop_y + crop_x) * 3;
      r = RGB24ToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u,
                      dst_stride_u, dst_v, dst_stride_v, crop_width,
                      inv_crop_height);
      break;
    case FOURCC_RAW:
      src = sample + (src_width * crop_y + crop_x) * 3;
      r = RAWToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u,
                    dst_stride_u, dst_v, dst_stride_v, crop_width,
                    inv_crop_height);
      break;
    case FOURCC_ARGB:
      src = sample + (src_width * crop_y + crop_x) * 4;
      r = ARGBToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
                     dst_stride_u, dst_v, dst_stride_v, crop_width,
                     inv_crop_height);
      break;
    case FOURCC_BGRA:
      src = sample + (src_width * crop_y + crop_x) * 4;
      r = BGRAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
                     dst_stride_u, dst_v, dst_stride_v, crop_width,
                     inv_crop_height);
      break;
    case FOURCC_ABGR:
      src = sample + (src_width * crop_y + crop_x) * 4;
      r = ABGRToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
                     dst_stride_u, dst_v, dst_stride_v, crop_width,
                     inv_crop_height);
      break;
    case FOURCC_RGBA:
      src = sample + (src_width * crop_y + crop_x) * 4;
      r = RGBAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
                     dst_stride_u, dst_v, dst_stride_v, crop_width,
                     inv_crop_height);
      break;
    // TODO(fbarchard): Add AR30 and AB30
    case FOURCC_I400:
      src = sample + src_width * crop_y + crop_x;
      r = I400ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u,
                     dst_v, dst_stride_v, crop_width, inv_crop_height);
      break;
    // Biplanar formats
    case FOURCC_NV12:
      src = sample + (src_width * crop_y + crop_x);
      // (crop_x / 2) * 2 keeps the pointer on the first byte of a UV pair.
      src_uv = sample + (src_width * abs_src_height) +
               ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
      r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y,
                           dst_stride_y, dst_u, dst_stride_u, dst_v,
                           dst_stride_v, crop_width, inv_crop_height,
                           first_pass_rotation);
      break;
    case FOURCC_NV21:
      src = sample + (src_width * crop_y + crop_x);
      src_uv = sample + (src_width * abs_src_height) +
               ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
      // Call NV12 but with dst_u and dst_v parameters swapped.
      r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y,
                           dst_stride_y, dst_v, dst_stride_v, dst_u,
                           dst_stride_u, crop_width, inv_crop_height,
                           first_pass_rotation);
      break;
    // Triplanar formats
    case FOURCC_I420:
    case FOURCC_YV12: {
      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
      const uint8_t* src_u;
      const uint8_t* src_v;
      int halfwidth = (src_width + 1) / 2;
      int halfheight = (abs_src_height + 1) / 2;
      // YV12 stores the V plane before the U plane.
      if (format == FOURCC_YV12) {
        src_v = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) +
                (crop_x / 2);
        src_u = sample + src_width * abs_src_height +
                halfwidth * (halfheight + (crop_y / 2)) + (crop_x / 2);
      } else {
        src_u = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) +
                (crop_x / 2);
        src_v = sample + src_width * abs_src_height +
                halfwidth * (halfheight + (crop_y / 2)) + (crop_x / 2);
      }
      r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                     dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
                     dst_stride_v, crop_width, inv_crop_height,
                     first_pass_rotation);
      break;
    }
    case FOURCC_I422:
    case FOURCC_YV16: {
      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
      const uint8_t* src_u;
      const uint8_t* src_v;
      int halfwidth = (src_width + 1) / 2;
      // YV16 stores the V plane before the U plane.
      if (format == FOURCC_YV16) {
        src_v = sample + src_width * abs_src_height + halfwidth * crop_y +
                (crop_x / 2);
        src_u = sample + src_width * abs_src_height +
                halfwidth * (abs_src_height + crop_y) + (crop_x / 2);
      } else {
        src_u = sample + src_width * abs_src_height + halfwidth * crop_y +
                (crop_x / 2);
        src_v = sample + src_width * abs_src_height +
                halfwidth * (abs_src_height + crop_y) + (crop_x / 2);
      }
      r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                     dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
                     dst_stride_v, crop_width, inv_crop_height);
      break;
    }
    case FOURCC_I444:
    case FOURCC_YV24: {
      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
      const uint8_t* src_u;
      const uint8_t* src_v;
      // YV24 stores the V plane before the U plane.
      if (format == FOURCC_YV24) {
        src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
        src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
      } else {
        src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
        src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
      }
      r = I444ToI420(src_y, src_width, src_u, src_width, src_v, src_width,
                     dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
                     dst_stride_v, crop_width, inv_crop_height);
      break;
    }
#ifdef HAVE_JPEG
    case FOURCC_MJPG:
      r = MJPGToI420(sample, sample_size, dst_y, dst_stride_y, dst_u,
                     dst_stride_u, dst_v, dst_stride_v, src_width,
                     abs_src_height, crop_width, inv_crop_height);
      break;
#endif
    default:
      r = -1;  // unknown fourcc - return failure code.
  }

  if (need_buf) {
    // Second pass: rotate (or copy, when rotation is 0) from the temporary
    // buffer into the caller's planes. Skipped if conversion failed.
    if (!r) {
      r = I420Rotate(dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
                     dst_stride_v, tmp_y, tmp_y_stride, tmp_u, tmp_u_stride,
                     tmp_v, tmp_v_stride, crop_width, abs_crop_height,
                     rotation);
    }
    free(rotate_buffer);
  }

  return r;
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
501
3rdparty/libyuv/source/cpu_id.cc
vendored
Normal file
501
3rdparty/libyuv/source/cpu_id.cc
vendored
Normal file
@@ -0,0 +1,501 @@
|
||||
/*
|
||||
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/cpu_id.h"
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#include <intrin.h> // For __cpuidex()
|
||||
#endif
|
||||
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
|
||||
!defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
|
||||
defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
|
||||
#include <immintrin.h> // For _xgetbv()
|
||||
#endif
|
||||
|
||||
// For ArmCpuCaps() but unittested on all platforms
|
||||
#include <stdio.h> // For fopen()
|
||||
#include <string.h>
|
||||
|
||||
#if defined(__linux__) && (defined(__aarch64__) || defined(__loongarch__))
|
||||
#include <sys/auxv.h> // For getauxval()
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32) && defined(__aarch64__)
|
||||
#undef WIN32_LEAN_AND_MEAN
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#undef WIN32_EXTRA_LEAN
|
||||
#define WIN32_EXTRA_LEAN
|
||||
#include <windows.h> // For IsProcessorFeaturePresent()
|
||||
#endif
|
||||
|
||||
#if defined(__APPLE__) && defined(__aarch64__)
|
||||
#include <sys/sysctl.h> // For sysctlbyname()
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// For functions that use the stack and have runtime checks for overflow,
|
||||
// use SAFEBUFFERS to avoid additional check.
|
||||
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) && \
|
||||
!defined(__clang__)
|
||||
#define SAFEBUFFERS __declspec(safebuffers)
|
||||
#else
|
||||
#define SAFEBUFFERS
|
||||
#endif
|
||||
|
||||
// cpu_info_ variable for SIMD instruction sets detected.
|
||||
LIBYUV_API int cpu_info_ = 0;
|
||||
|
||||
// Low level cpuid for X86.
|
||||
#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
|
||||
defined(__x86_64__)) && \
|
||||
!defined(__pnacl__) && !defined(__CLR_VER)
|
||||
// Executes the x86 CPUID instruction with EAX = info_eax and ECX = info_ecx
// and stores the resulting registers in cpu_info as
// [0] = EAX, [1] = EBX, [2] = ECX, [3] = EDX.
// cpu_info must have room for four ints.
LIBYUV_API
void CpuId(int info_eax, int info_ecx, int* cpu_info) {
#if defined(_MSC_VER)
// Visual C version uses intrinsic or inline x86 assembly.
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
  __cpuidex(cpu_info, info_eax, info_ecx);
#elif defined(_M_IX86)
  __asm {
    mov        eax, info_eax
    mov        ecx, info_ecx
    mov        edi, cpu_info
    cpuid
    mov        [edi], eax
    mov        [edi + 4], ebx
    mov        [edi + 8], ecx
    mov        [edi + 12], edx
  }
#else  // Visual C but not x86
  // __cpuid takes no ECX input, so only leaf queries with ECX == 0 are
  // answered; any other request reports all zeros.
  if (info_ecx != 0) {
    cpu_info[0] = 0u;
    cpu_info[1] = 0u;
    cpu_info[2] = 0u;
    cpu_info[3] = 0u;
  } else {
    __cpuid(cpu_info, info_eax);
  }
#endif
// GCC version uses inline x86 assembly.
#else  // defined(_MSC_VER)
  int ebx_value, edx_value;
  asm volatile(
#if defined(__i386__) && defined(__PIC__)
      // Preserve ebx for fpic 32 bit.
      "mov         %%ebx, %%edi                  \n"
      "cpuid                                     \n"
      "xchg        %%edi, %%ebx                  \n"
      : "=D"(ebx_value),
#else
      "cpuid                                     \n"
      : "=b"(ebx_value),
#endif  //  defined( __i386__) && defined(__PIC__)
        "+a"(info_eax), "+c"(info_ecx), "=d"(edx_value));
  // info_eax and info_ecx are read-write operands, so they now hold the
  // values CPUID left in EAX and ECX.
  cpu_info[0] = info_eax;
  cpu_info[1] = ebx_value;
  cpu_info[2] = info_ecx;
  cpu_info[3] = edx_value;
#endif  // defined(_MSC_VER)
}
|
||||
#else // (defined(_M_IX86) || defined(_M_X64) ...
|
||||
// Fallback used when the x86 CPUID path is not compiled in: ignores the
// requested leaf and reports four zero registers in cpu_info.
LIBYUV_API
void CpuId(int eax, int ecx, int* cpu_info) {
  int reg;
  (void)eax;
  (void)ecx;
  for (reg = 0; reg < 4; ++reg) {
    cpu_info[reg] = 0;
  }
}
|
||||
#endif
|
||||
|
||||
// For VS2010 and earlier emit can be used:
|
||||
// _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
|
||||
// __asm {
|
||||
// xor ecx, ecx // xcr 0
|
||||
// xgetbv
|
||||
// mov xcr0, eax
|
||||
// }
|
||||
// For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code.
|
||||
// https://code.google.com/p/libyuv/issues/detail?id=529
|
||||
#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900)
|
||||
#pragma optimize("g", off)
|
||||
#endif
|
||||
#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
|
||||
defined(__x86_64__)) && \
|
||||
!defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
|
||||
// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
|
||||
static int GetXCR0() {
|
||||
int xcr0 = 0;
|
||||
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
|
||||
xcr0 = (int)_xgetbv(0); // VS2010 SP1 required. NOLINT
|
||||
#elif defined(__i386__) || defined(__x86_64__)
|
||||
asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx");
|
||||
#endif // defined(__i386__) || defined(__x86_64__)
|
||||
return xcr0;
|
||||
}
|
||||
#else
|
||||
// xgetbv unavailable to query for OSSave support. Return 0.
|
||||
#define GetXCR0() 0
|
||||
#endif // defined(_M_IX86) || defined(_M_X64) ..
|
||||
// Return optimization to previous setting.
|
||||
#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900)
|
||||
#pragma optimize("g", on)
|
||||
#endif
|
||||
|
||||
// Reports whether |needle| occurs in |cpuinfo_line| as a complete token.
//   cpuinfo_line: NUL terminated text line.
//   needle:       text to look for (callers include the leading space).
//   needle_len:   length of |needle| in characters.
// A match counts only when the character right after it is a space, a
// newline or the end of the string. Every occurrence is examined, so a
// longer word sharing the prefix (e.g. " neonx") does not hide a later
// exact match.
// Returns 1 when found, 0 otherwise.
static int cpuinfo_search(const char* cpuinfo_line,
                          const char* needle,
                          int needle_len) {
  const char* p = cpuinfo_line;
  while ((p = strstr(p, needle)) != NULL) {
    const char tail = p[needle_len];
    if (tail == ' ' || tail == '\n' || tail == '\0') {
      return 1;
    }
    // Prefix of a longer word: resume the scan one character further on.
    ++p;
  }
  return 0;
}
|
||||
|
||||
// Based on libvpx arm_cpudetect.c
|
||||
// For Arm, but public to allow testing on any CPU
|
||||
LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) {
|
||||
char cpuinfo_line[512];
|
||||
FILE* f = fopen(cpuinfo_name, "re");
|
||||
if (!f) {
|
||||
// Assume Neon if /proc/cpuinfo is unavailable.
|
||||
// This will occur for Chrome sandbox for Pepper or Render process.
|
||||
return kCpuHasNEON;
|
||||
}
|
||||
memset(cpuinfo_line, 0, sizeof(cpuinfo_line));
|
||||
int features = 0;
|
||||
while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f)) {
|
||||
if (memcmp(cpuinfo_line, "Features", 8) == 0) {
|
||||
if (cpuinfo_search(cpuinfo_line, " neon", 5)) {
|
||||
features |= kCpuHasNEON;
|
||||
}
|
||||
}
|
||||
}
|
||||
fclose(f);
|
||||
return features;
|
||||
}
|
||||
|
||||
#ifdef __aarch64__
|
||||
#ifdef __linux__
|
||||
// Define hwcap values ourselves: building with an old auxv header where these
|
||||
// hwcap values are not defined should not prevent features from being enabled.
|
||||
#define YUV_AARCH64_HWCAP_ASIMDDP (1UL << 20)
|
||||
#define YUV_AARCH64_HWCAP_SVE (1UL << 22)
|
||||
#define YUV_AARCH64_HWCAP2_SVE2 (1UL << 1)
|
||||
#define YUV_AARCH64_HWCAP2_SVEF32MM (1UL << 10)
|
||||
#define YUV_AARCH64_HWCAP2_I8MM (1UL << 13)
|
||||
#define YUV_AARCH64_HWCAP2_SME (1UL << 23)
|
||||
#define YUV_AARCH64_HWCAP2_SME2 (1UL << 37)
|
||||
|
||||
// For AArch64, but public to allow testing on any CPU.
|
||||
LIBYUV_API SAFEBUFFERS int AArch64CpuCaps(unsigned long hwcap,
|
||||
unsigned long hwcap2) {
|
||||
// Neon is mandatory on AArch64, so enable regardless of hwcaps.
|
||||
int features = kCpuHasNEON;
|
||||
|
||||
// Don't try to enable later extensions unless earlier extensions are also
|
||||
// reported available. Some of these constraints aren't strictly required by
|
||||
// the architecture, but are satisfied by all micro-architectures of
|
||||
// interest. This also avoids an issue on some emulators where true
|
||||
// architectural constraints are not satisfied, e.g. SVE2 may be reported as
|
||||
// available while SVE is not.
|
||||
if (hwcap & YUV_AARCH64_HWCAP_ASIMDDP) {
|
||||
features |= kCpuHasNeonDotProd;
|
||||
if (hwcap2 & YUV_AARCH64_HWCAP2_I8MM) {
|
||||
features |= kCpuHasNeonI8MM;
|
||||
if (hwcap & YUV_AARCH64_HWCAP_SVE) {
|
||||
features |= kCpuHasSVE;
|
||||
if (hwcap2 & YUV_AARCH64_HWCAP2_SVEF32MM) {
|
||||
features |= kCpuHasSVEF32MM;
|
||||
}
|
||||
if (hwcap2 & YUV_AARCH64_HWCAP2_SVE2) {
|
||||
features |= kCpuHasSVE2;
|
||||
}
|
||||
}
|
||||
// SME may be present without SVE
|
||||
if (hwcap2 & YUV_AARCH64_HWCAP2_SME) {
|
||||
features |= kCpuHasSME;
|
||||
if (hwcap2 & YUV_AARCH64_HWCAP2_SME2) {
|
||||
features |= kCpuHasSME2;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return features;
|
||||
}
|
||||
|
||||
#elif defined(_WIN32)
|
||||
// For AArch64, but public to allow testing on any CPU.
|
||||
LIBYUV_API SAFEBUFFERS int AArch64CpuCaps() {
|
||||
// Neon is mandatory on AArch64, so enable unconditionally.
|
||||
int features = kCpuHasNEON;
|
||||
|
||||
// For more information on IsProcessorFeaturePresent(), see:
|
||||
// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent#parameters
|
||||
#ifdef PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE
|
||||
if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) {
|
||||
features |= kCpuHasNeonDotProd;
|
||||
}
|
||||
#endif
|
||||
// No Neon I8MM or SVE feature detection available here at time of writing.
|
||||
return features;
|
||||
}
|
||||
|
||||
#elif defined(__APPLE__)
|
||||
// Queries a named sysctl value and reports whether it is non-zero.
//   feature: sysctl name passed to sysctlbyname().
// Returns false when the lookup fails or the value read is zero.
static bool have_feature(const char* feature) {
  // For more information on sysctlbyname(), see:
  // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics
  int64_t value = 0;
  size_t value_size = sizeof(value);
  const int rc = sysctlbyname(feature, &value, &value_size, NULL, 0);
  return rc == 0 && value != 0;
}
|
||||
|
||||
// For AArch64, but public to allow testing on any CPU.
|
||||
LIBYUV_API SAFEBUFFERS int AArch64CpuCaps() {
|
||||
// Neon is mandatory on AArch64, so enable unconditionally.
|
||||
int features = kCpuHasNEON;
|
||||
|
||||
if (have_feature("hw.optional.arm.FEAT_DotProd")) {
|
||||
features |= kCpuHasNeonDotProd;
|
||||
if (have_feature("hw.optional.arm.FEAT_I8MM")) {
|
||||
features |= kCpuHasNeonI8MM;
|
||||
if (have_feature("hw.optional.arm.FEAT_SME")) {
|
||||
features |= kCpuHasSME;
|
||||
if (have_feature("hw.optional.arm.FEAT_SME2")) {
|
||||
features |= kCpuHasSME2;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// No SVE feature detection available here at time of writing.
|
||||
return features;
|
||||
}
|
||||
|
||||
#else // !defined(__linux__) && !defined(_WIN32) && !defined(__APPLE__)
|
||||
// For AArch64, but public to allow testing on any CPU.
|
||||
LIBYUV_API SAFEBUFFERS int AArch64CpuCaps() {
|
||||
// Neon is mandatory on AArch64, so enable unconditionally.
|
||||
int features = kCpuHasNEON;
|
||||
|
||||
// TODO(libyuv:980) support feature detection on other platforms.
|
||||
|
||||
return features;
|
||||
}
|
||||
#endif
|
||||
#endif // defined(__aarch64__)
|
||||
|
||||
LIBYUV_API SAFEBUFFERS int RiscvCpuCaps(const char* cpuinfo_name) {
|
||||
char cpuinfo_line[512];
|
||||
int flag = 0;
|
||||
FILE* f = fopen(cpuinfo_name, "re");
|
||||
if (!f) {
|
||||
#if defined(__riscv_vector)
|
||||
// Assume RVV if /proc/cpuinfo is unavailable.
|
||||
// This will occur for Chrome sandbox for Pepper or Render process.
|
||||
return kCpuHasRVV;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
memset(cpuinfo_line, 0, sizeof(cpuinfo_line));
|
||||
while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f)) {
|
||||
if (memcmp(cpuinfo_line, "isa", 3) == 0) {
|
||||
// ISA string must begin with rv64{i,e,g} for a 64-bit processor.
|
||||
char* isa = strstr(cpuinfo_line, "rv64");
|
||||
if (isa) {
|
||||
size_t isa_len = strlen(isa);
|
||||
char* extensions;
|
||||
size_t extensions_len = 0;
|
||||
size_t std_isa_len;
|
||||
// Remove the new-line character at the end of string
|
||||
if (isa[isa_len - 1] == '\n') {
|
||||
isa[--isa_len] = '\0';
|
||||
}
|
||||
// 5 ISA characters
|
||||
if (isa_len < 5) {
|
||||
fclose(f);
|
||||
return 0;
|
||||
}
|
||||
// Skip {i,e,g} canonical checking.
|
||||
// Skip rvxxx
|
||||
isa += 5;
|
||||
// Find the very first occurrence of 's', 'x' or 'z'.
|
||||
// To detect multi-letter standard, non-standard, and
|
||||
// supervisor-level extensions.
|
||||
extensions = strpbrk(isa, "zxs");
|
||||
if (extensions) {
|
||||
extensions_len = strlen(extensions);
|
||||
// Multi-letter extensions are seperated by a single underscore
|
||||
// as described in RISC-V User-Level ISA V2.2.
|
||||
char* ext = extensions;
|
||||
while (ext) {
|
||||
char* next = strchr(ext, '_');
|
||||
if (next) {
|
||||
*next = '\0';
|
||||
next++;
|
||||
}
|
||||
// Search for the ZVFH (Vector FP16) extension.
|
||||
if (!strcmp(ext, "zvfh")) {
|
||||
flag |= kCpuHasRVVZVFH;
|
||||
}
|
||||
ext = next;
|
||||
}
|
||||
}
|
||||
std_isa_len = isa_len - extensions_len - 5;
|
||||
// Detect the v in the standard single-letter extensions.
|
||||
if (memchr(isa, 'v', std_isa_len)) {
|
||||
// The RVV implied the F extension.
|
||||
flag |= kCpuHasRVV;
|
||||
}
|
||||
}
|
||||
}
|
||||
#if defined(__riscv_vector)
|
||||
// Assume RVV if /proc/cpuinfo is from x86 host running QEMU.
|
||||
else if ((memcmp(cpuinfo_line, "vendor_id\t: GenuineIntel", 24) == 0) ||
|
||||
(memcmp(cpuinfo_line, "vendor_id\t: AuthenticAMD", 24) == 0)) {
|
||||
fclose(f);
|
||||
return kCpuHasRVV;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
fclose(f);
|
||||
return flag;
|
||||
}
|
||||
|
||||
#if defined(__loongarch__) && defined(__linux__)
|
||||
// Define hwcap values ourselves: building with an old auxv header where these
|
||||
// hwcap values are not defined should not prevent features from being enabled.
|
||||
#define YUV_LOONGARCH_HWCAP_LSX (1 << 4)
|
||||
#define YUV_LOONGARCH_HWCAP_LASX (1 << 5)
|
||||
|
||||
LIBYUV_API SAFEBUFFERS int LoongArchCpuCaps(void) {
|
||||
int flag = 0;
|
||||
unsigned long hwcap = getauxval(AT_HWCAP);
|
||||
|
||||
if (hwcap & YUV_LOONGARCH_HWCAP_LSX)
|
||||
flag |= kCpuHasLSX;
|
||||
|
||||
if (hwcap & YUV_LOONGARCH_HWCAP_LASX)
|
||||
flag |= kCpuHasLASX;
|
||||
return flag;
|
||||
}
|
||||
#endif
|
||||
|
||||
static SAFEBUFFERS int GetCpuFlags(void) {
|
||||
int cpu_info = 0;
|
||||
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
|
||||
(defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
|
||||
defined(_M_IX86))
|
||||
int cpu_info0[4] = {0, 0, 0, 0};
|
||||
int cpu_info1[4] = {0, 0, 0, 0};
|
||||
int cpu_info7[4] = {0, 0, 0, 0};
|
||||
int cpu_einfo7[4] = {0, 0, 0, 0};
|
||||
int cpu_info24[4] = {0, 0, 0, 0};
|
||||
int cpu_amdinfo21[4] = {0, 0, 0, 0};
|
||||
CpuId(0, 0, cpu_info0);
|
||||
CpuId(1, 0, cpu_info1);
|
||||
if (cpu_info0[0] >= 7) {
|
||||
CpuId(7, 0, cpu_info7);
|
||||
CpuId(7, 1, cpu_einfo7);
|
||||
CpuId(0x80000021, 0, cpu_amdinfo21);
|
||||
}
|
||||
if (cpu_info0[0] >= 0x24) {
|
||||
CpuId(0x24, 0, cpu_info24);
|
||||
}
|
||||
cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
|
||||
((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
|
||||
((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
|
||||
((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
|
||||
((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
|
||||
((cpu_info7[3] & 0x00000010) ? kCpuHasFSMR : 0);
|
||||
|
||||
// AVX requires OS saves YMM registers.
|
||||
if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave
|
||||
((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers
|
||||
cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
|
||||
((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
|
||||
((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0) |
|
||||
((cpu_einfo7[0] & 0x00000010) ? kCpuHasAVXVNNI : 0) |
|
||||
((cpu_einfo7[3] & 0x00000010) ? kCpuHasAVXVNNIINT8 : 0);
|
||||
|
||||
cpu_info |= ((cpu_amdinfo21[0] & 0x00008000) ? kCpuHasERMS : 0);
|
||||
|
||||
// Detect AVX512bw
|
||||
if ((GetXCR0() & 0xe0) == 0xe0 && (cpu_info7[1] & 0x00010000)) {
|
||||
cpu_info |= ((cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0) |
|
||||
((cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0) |
|
||||
((cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0) |
|
||||
((cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0) |
|
||||
((cpu_info7[2] & 0x00000800) ? kCpuHasAVX512VNNI : 0) |
|
||||
((cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0) |
|
||||
((cpu_einfo7[3] & 0x00080000) ? kCpuHasAVX10 : 0) |
|
||||
((cpu_info7[3] & 0x02000000) ? kCpuHasAMXINT8 : 0);
|
||||
if (cpu_info0[0] >= 0x24 && (cpu_einfo7[3] & 0x00080000)) {
|
||||
cpu_info |= ((cpu_info24[1] & 0xFF) >= 2) ? kCpuHasAVX10_2 : 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(__loongarch__) && defined(__linux__)
|
||||
cpu_info = LoongArchCpuCaps();
|
||||
cpu_info |= kCpuHasLOONGARCH;
|
||||
#endif
|
||||
#if defined(__aarch64__)
|
||||
#if defined(__linux__)
|
||||
// getauxval is supported since Android SDK version 18, minimum at time of
|
||||
// writing is 21, so should be safe to always use this. If getauxval is
|
||||
// somehow disabled then getauxval returns 0, which will leave Neon enabled
|
||||
// since Neon is mandatory on AArch64.
|
||||
unsigned long hwcap = getauxval(AT_HWCAP);
|
||||
unsigned long hwcap2 = getauxval(AT_HWCAP2);
|
||||
cpu_info = AArch64CpuCaps(hwcap, hwcap2);
|
||||
#else
|
||||
cpu_info = AArch64CpuCaps();
|
||||
#endif
|
||||
cpu_info |= kCpuHasARM;
|
||||
#endif // __aarch64__
|
||||
#if defined(__arm__)
|
||||
// gcc -mfpu=neon defines __ARM_NEON__
|
||||
// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon.
|
||||
// For Linux, /proc/cpuinfo can be tested but without that assume Neon.
|
||||
// Linux arm parse text file for neon detect.
|
||||
#if defined(__linux__)
|
||||
cpu_info = ArmCpuCaps("/proc/cpuinfo");
|
||||
#elif defined(__ARM_NEON__)
|
||||
cpu_info = kCpuHasNEON;
|
||||
#else
|
||||
cpu_info = 0;
|
||||
#endif
|
||||
cpu_info |= kCpuHasARM;
|
||||
#endif // __arm__
|
||||
#if defined(__riscv) && defined(__linux__)
|
||||
cpu_info = RiscvCpuCaps("/proc/cpuinfo");
|
||||
cpu_info |= kCpuHasRISCV;
|
||||
#endif // __riscv
|
||||
cpu_info |= kCpuInitialized;
|
||||
return cpu_info;
|
||||
}
|
||||
|
||||
// Note that use of this function is not thread safe.
|
||||
LIBYUV_API
|
||||
// Detects the CPU flags, keeps only those also set in |enable_flags|, stores
// the result through SetCpuFlags() and returns it.
int MaskCpuFlags(int enable_flags) {
  const int detected = GetCpuFlags();
  const int enabled = detected & enable_flags;
  SetCpuFlags(enabled);
  return enabled;
}
|
||||
|
||||
LIBYUV_API
|
||||
// Initializes the CPU flags with nothing masked off and returns them.
int InitCpuFlags(void) {
  const int all_flags = -1;  // every bit set: keep all detected flags
  return MaskCpuFlags(all_flags);
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
580
3rdparty/libyuv/source/mjpeg_decoder.cc
vendored
Normal file
580
3rdparty/libyuv/source/mjpeg_decoder.cc
vendored
Normal file
@@ -0,0 +1,580 @@
|
||||
/*
|
||||
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/mjpeg_decoder.h"
|
||||
|
||||
#ifdef HAVE_JPEG
|
||||
#include <assert.h>
|
||||
|
||||
#if !defined(__pnacl__) && !defined(__CLR_VER) && !defined(COVERAGE_ENABLED)
|
||||
// Must be included before jpeglib.
|
||||
#include <setjmp.h>
|
||||
#define HAVE_SETJMP
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// disable warning 4324: structure was padded due to __declspec(align())
|
||||
#pragma warning(disable : 4324)
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#include <stdio.h> // For jpeglib.h.
|
||||
|
||||
// C++ build requires extern C for jpeg internals.
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <jpeglib.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#include "libyuv/planar_functions.h" // For CopyPlane().
|
||||
|
||||
namespace libyuv {
|
||||
|
||||
#ifdef HAVE_SETJMP
|
||||
struct SetJmpErrorMgr {
|
||||
jpeg_error_mgr base; // Must be at the top
|
||||
jmp_buf setjmp_buffer;
|
||||
};
|
||||
#endif
|
||||
|
||||
const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN;
|
||||
const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE;
|
||||
const int MJpegDecoder::kColorSpaceRgb = JCS_RGB;
|
||||
const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr;
|
||||
const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK;
|
||||
const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK;
|
||||
|
||||
// Methods that are passed to jpeglib.
|
||||
boolean fill_input_buffer(jpeg_decompress_struct* cinfo);
|
||||
void init_source(jpeg_decompress_struct* cinfo);
|
||||
void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes); // NOLINT
|
||||
void term_source(jpeg_decompress_struct* cinfo);
|
||||
void ErrorHandler(jpeg_common_struct* cinfo);
|
||||
void OutputHandler(jpeg_common_struct* cinfo);
|
||||
|
||||
// Allocates the jpeglib decompress, source-manager and (with HAVE_SETJMP)
// error-manager objects, installs this file's callbacks on them and creates
// the decompressor. No output buffers are allocated yet.
MJpegDecoder::MJpegDecoder()
    : has_scanline_padding_(LIBYUV_FALSE),
      num_outbufs_(0),
      scanlines_(NULL),
      scanlines_sizes_(NULL),
      databuf_(NULL),
      databuf_strides_(NULL) {
  decompress_struct_ = new jpeg_decompress_struct;
  source_mgr_ = new jpeg_source_mgr;
#ifdef HAVE_SETJMP
  error_mgr_ = new SetJmpErrorMgr;
  jpeg_error_mgr* const err_base = &error_mgr_->base;
  decompress_struct_->err = jpeg_std_error(err_base);
  // Override standard exit()-based error handler.
  err_base->error_exit = &ErrorHandler;
  err_base->output_message = &OutputHandler;
#endif
  decompress_struct_->client_data = NULL;

  // Input comes from memory through the callbacks defined in this file;
  // restart resynchronization uses jpeglib's own routine.
  jpeg_source_mgr* const src = source_mgr_;
  src->init_source = &init_source;
  src->fill_input_buffer = &fill_input_buffer;
  src->skip_input_data = &skip_input_data;
  src->resync_to_restart = &jpeg_resync_to_restart;
  src->term_source = &term_source;

  jpeg_create_decompress(decompress_struct_);
  decompress_struct_->src = src;

  // The buffer vector always describes exactly one buffer: buf_.
  buf_vec_.buffers = &buf_;
  buf_vec_.len = 1;
}
|
||||
|
||||
// Destroys the jpeglib decompressor, then frees the helper objects created
// by the constructor and any output buffers.
MJpegDecoder::~MJpegDecoder() {
  // jpeglib tears down its own state before the struct itself is freed.
  jpeg_destroy_decompress(decompress_struct_);
  delete decompress_struct_;
  delete source_mgr_;
#ifdef HAVE_SETJMP
  delete error_mgr_;
#endif
  DestroyOutputBuffers();
}
|
||||
|
||||
// Loads a JPEG frame: validates it, parses the header and sizes the
// per-component scanline-pointer arrays and data buffers for decoding.
//   src:     start of the JPEG data; only the pointer is stored.
//   src_len: number of bytes at |src|.
// Returns LIBYUV_TRUE on success, LIBYUV_FALSE if validation fails or
// jpeglib rejects the header.
LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8_t* src, size_t src_len) {
  if (!ValidateJpeg(src, src_len)) {
    return LIBYUV_FALSE;
  }

  buf_.data = src;
  buf_.len = (int)src_len;
  buf_vec_.pos = 0;
  decompress_struct_->client_data = &buf_vec_;
#ifdef HAVE_SETJMP
  if (setjmp(error_mgr_->setjmp_buffer)) {
    // We called jpeg_read_header, it experienced an error, and we called
    // longjmp() and rewound the stack to here. Return error.
    return LIBYUV_FALSE;
  }
#endif
  if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) {
    // ERROR: Bad MJPEG header
    return LIBYUV_FALSE;
  }
  AllocOutputBuffers(GetNumComponents());
  for (int i = 0; i < num_outbufs_; ++i) {
    int scanlines_size = GetComponentScanlinesPerImcuRow(i);

    // We allocate padding for the final scanline to pad it up to DCTSIZE bytes
    // to avoid memory errors, since jpeglib only reads full MCUs blocks. For
    // the preceding scanlines, the padding is not needed/wanted because the
    // following addresses will already be valid (they are the initial bytes of
    // the next scanline) and will be overwritten when jpeglib writes out that
    // next scanline.
    int databuf_stride = GetComponentStride(i);
    int databuf_size = scanlines_size * databuf_stride;
    // The buffer holds scanlines_size rows of databuf_stride bytes, so it
    // must be reallocated when either dimension changes. This is checked
    // before scanlines_sizes_[i] is updated below.
    if (databuf_strides_[i] != databuf_stride ||
        scanlines_sizes_[i] != scanlines_size) {
      // Allocated with new[], so released with delete[]. The slot is cleared
      // first so nothing dangles if the allocation throws.
      delete[] databuf_[i];
      databuf_[i] = NULL;
      databuf_strides_[i] = 0;
      databuf_[i] = new uint8_t[databuf_size];
      databuf_strides_[i] = databuf_stride;
    }

    if (scanlines_sizes_[i] != scanlines_size) {
      delete[] scanlines_[i];
      scanlines_[i] = NULL;
      scanlines_sizes_[i] = 0;
      scanlines_[i] = new uint8_t*[scanlines_size];
      scanlines_sizes_[i] = scanlines_size;
    }

    if (databuf_stride != GetComponentWidth(i)) {
      has_scanline_padding_ = LIBYUV_TRUE;
    }
  }
  return LIBYUV_TRUE;
}
|
||||
|
||||
// Integer division of |numerator| by |denominator|, with denominator - 1
// added to the numerator first so that exact multiples stay unchanged and
// everything else moves up to the next quotient.
static int DivideAndRoundUp(int numerator, int denominator) {
  const int biased = numerator + denominator - 1;
  return biased / denominator;
}
|
||||
|
||||
// Plain integer division of |numerator| by |denominator|.
static int DivideAndRoundDown(int numerator, int denominator) {
  const int quotient = numerator / denominator;
  return quotient;
}
|
||||
|
||||
// Returns width of the last loaded frame.
|
||||
int MJpegDecoder::GetWidth() {
|
||||
return decompress_struct_->image_width;
|
||||
}
|
||||
|
||||
// Returns height of the last loaded frame.
|
||||
int MJpegDecoder::GetHeight() {
|
||||
return decompress_struct_->image_height;
|
||||
}
|
||||
|
||||
// Returns format of the last loaded frame. The return value is one of the
|
||||
// kColorSpace* constants.
|
||||
int MJpegDecoder::GetColorSpace() {
|
||||
return decompress_struct_->jpeg_color_space;
|
||||
}
|
||||
|
||||
// Number of color components in the color space.
|
||||
int MJpegDecoder::GetNumComponents() {
|
||||
return decompress_struct_->num_components;
|
||||
}
|
||||
|
||||
// Sample factors of the n-th component.
|
||||
int MJpegDecoder::GetHorizSampFactor(int component) {
|
||||
return decompress_struct_->comp_info[component].h_samp_factor;
|
||||
}
|
||||
|
||||
int MJpegDecoder::GetVertSampFactor(int component) {
|
||||
return decompress_struct_->comp_info[component].v_samp_factor;
|
||||
}
|
||||
|
||||
int MJpegDecoder::GetHorizSubSampFactor(int component) {
|
||||
return decompress_struct_->max_h_samp_factor / GetHorizSampFactor(component);
|
||||
}
|
||||
|
||||
int MJpegDecoder::GetVertSubSampFactor(int component) {
|
||||
return decompress_struct_->max_v_samp_factor / GetVertSampFactor(component);
|
||||
}
|
||||
|
||||
int MJpegDecoder::GetImageScanlinesPerImcuRow() {
|
||||
return decompress_struct_->max_v_samp_factor * DCTSIZE;
|
||||
}
|
||||
|
||||
int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) {
|
||||
int vs = GetVertSubSampFactor(component);
|
||||
return DivideAndRoundUp(GetImageScanlinesPerImcuRow(), vs);
|
||||
}
|
||||
|
||||
int MJpegDecoder::GetComponentWidth(int component) {
|
||||
int hs = GetHorizSubSampFactor(component);
|
||||
return DivideAndRoundUp(GetWidth(), hs);
|
||||
}
|
||||
|
||||
int MJpegDecoder::GetComponentHeight(int component) {
|
||||
int vs = GetVertSubSampFactor(component);
|
||||
return DivideAndRoundUp(GetHeight(), vs);
|
||||
}
|
||||
|
||||
// Get width in bytes padded out to a multiple of DCTSIZE
|
||||
int MJpegDecoder::GetComponentStride(int component) {
|
||||
return (GetComponentWidth(component) + DCTSIZE - 1) & ~(DCTSIZE - 1);
|
||||
}
|
||||
|
||||
int MJpegDecoder::GetComponentSize(int component) {
|
||||
return GetComponentWidth(component) * GetComponentHeight(component);
|
||||
}
|
||||
|
||||
// Aborts the current jpeglib decompress operation.
// Returns LIBYUV_FALSE if jpeglib reports an error while doing so (with
// HAVE_SETJMP), LIBYUV_TRUE otherwise.
LIBYUV_BOOL MJpegDecoder::UnloadFrame() {
#ifdef HAVE_SETJMP
  const int jumped_back = setjmp(error_mgr_->setjmp_buffer);
  if (jumped_back != 0) {
    // We called jpeg_abort_decompress, it experienced an error, and we called
    // longjmp() and rewound the stack to here. Return error.
    return LIBYUV_FALSE;
  }
#endif
  jpeg_abort_decompress(decompress_struct_);
  return LIBYUV_TRUE;
}
|
||||
|
||||
// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
|
||||
// Decodes the loaded frame into caller-provided planes, cropping vertically
// around the centre when dst_height is smaller than the image height.
//   planes:     one destination pointer per component; rows are written
//               tightly packed at the component width, and each pointer is
//               advanced past the rows written.
//   dst_width:  must equal the image width.
//   dst_height: number of image rows wanted; must not exceed image height.
// Returns LIBYUV_FALSE on bad dimensions or any decode error, otherwise the
// result of FinishDecode().
// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(uint8_t** planes,
                                          int dst_width,
                                          int dst_height) {
  if (dst_width != GetWidth() || dst_height > GetHeight()) {
    // ERROR: Bad dimensions
    return LIBYUV_FALSE;
  }
#ifdef HAVE_SETJMP
  if (setjmp(error_mgr_->setjmp_buffer)) {
    // We called into jpeglib, it experienced an error sometime during this
    // function call, and we called longjmp() and rewound the stack to here.
    // Return error.
    return LIBYUV_FALSE;
  }
#endif
  if (!StartDecode()) {
    return LIBYUV_FALSE;
  }
  SetScanlinePointers(databuf_);
  int lines_left = dst_height;
  // Compute amount of lines to skip to implement vertical crop.
  // TODO(fbarchard): Ensure skip is a multiple of maximum component
  // subsample. ie 2
  int skip = (GetHeight() - dst_height) / 2;
  if (skip > 0) {
    // There is no API to skip lines in the output data, so we read them
    // into the temp buffer.
    while (skip >= GetImageScanlinesPerImcuRow()) {
      if (!DecodeImcuRow()) {
        FinishDecode();
        return LIBYUV_FALSE;
      }
      skip -= GetImageScanlinesPerImcuRow();
    }
    if (skip > 0) {
      // Have a partial iMCU row left over to skip. Must read it and then
      // copy the parts we want into the destination.
      if (!DecodeImcuRow()) {
        FinishDecode();
        return LIBYUV_FALSE;
      }
      // Image rows available after the skip. Never hand out more than the
      // caller asked for, otherwise the copy runs past the end of the
      // destination planes when dst_height is very small.
      int lines_to_copy = GetImageScanlinesPerImcuRow() - skip;
      const bool clamped = lines_to_copy > lines_left;
      if (clamped) {
        lines_to_copy = lines_left;
      }
      for (int i = 0; i < num_outbufs_; ++i) {
        // TODO(fbarchard): Compute skip to avoid this
        assert(skip % GetVertSubSampFactor(i) == 0);
        int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
        int scanlines_to_copy =
            GetComponentScanlinesPerImcuRow(i) - rows_to_skip;
        if (clamped) {
          // Same rounding as the final partial iMCU row below.
          int max_rows = DivideAndRoundUp(lines_to_copy, GetVertSubSampFactor(i));
          if (scanlines_to_copy > max_rows) {
            scanlines_to_copy = max_rows;
          }
        }
        int data_to_skip = rows_to_skip * GetComponentStride(i);
        CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), planes[i],
                  GetComponentWidth(i), GetComponentWidth(i),
                  scanlines_to_copy);
        planes[i] += scanlines_to_copy * GetComponentWidth(i);
      }
      lines_left -= lines_to_copy;
    }
  }

  // Read full MCUs but cropped horizontally
  for (; lines_left > GetImageScanlinesPerImcuRow();
       lines_left -= GetImageScanlinesPerImcuRow()) {
    if (!DecodeImcuRow()) {
      FinishDecode();
      return LIBYUV_FALSE;
    }
    for (int i = 0; i < num_outbufs_; ++i) {
      int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i);
      CopyPlane(databuf_[i], GetComponentStride(i), planes[i],
                GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy);
      planes[i] += scanlines_to_copy * GetComponentWidth(i);
    }
  }

  if (lines_left > 0) {
    // Have a partial iMCU row left over to decode.
    if (!DecodeImcuRow()) {
      FinishDecode();
      return LIBYUV_FALSE;
    }
    for (int i = 0; i < num_outbufs_; ++i) {
      int scanlines_to_copy =
          DivideAndRoundUp(lines_left, GetVertSubSampFactor(i));
      CopyPlane(databuf_[i], GetComponentStride(i), planes[i],
                GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy);
      planes[i] += scanlines_to_copy * GetComponentWidth(i);
    }
  }
  return FinishDecode();
}
|
||||
|
||||
// Decodes the loaded frame one iMCU row at a time and passes each decoded
// row group to a callback, cropping vertically around the centre when
// dst_height is smaller than the image height.
//   fn:         called as fn(opaque, databuf_, databuf_strides_, rows).
//   opaque:     passed through to |fn| untouched.
//   dst_width:  must equal the image width.
//   dst_height: number of image rows wanted; must not exceed image height.
// Returns LIBYUV_FALSE on bad dimensions or any decode error, otherwise the
// result of FinishDecode().
LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn,
                                           void* opaque,
                                           int dst_width,
                                           int dst_height) {
  if (dst_width != GetWidth() || dst_height > GetHeight()) {
    // ERROR: Bad dimensions
    return LIBYUV_FALSE;
  }
#ifdef HAVE_SETJMP
  if (setjmp(error_mgr_->setjmp_buffer)) {
    // We called into jpeglib, it experienced an error sometime during this
    // function call, and we called longjmp() and rewound the stack to here.
    // Return error.
    return LIBYUV_FALSE;
  }
#endif
  if (!StartDecode()) {
    return LIBYUV_FALSE;
  }
  SetScanlinePointers(databuf_);
  int lines_left = dst_height;
  // TODO(fbarchard): Compute amount of lines to skip to implement vertical crop
  int skip = (GetHeight() - dst_height) / 2;
  if (skip > 0) {
    while (skip >= GetImageScanlinesPerImcuRow()) {
      if (!DecodeImcuRow()) {
        FinishDecode();
        return LIBYUV_FALSE;
      }
      skip -= GetImageScanlinesPerImcuRow();
    }
    if (skip > 0) {
      // Have a partial iMCU row left over to skip.
      if (!DecodeImcuRow()) {
        FinishDecode();
        return LIBYUV_FALSE;
      }
      for (int i = 0; i < num_outbufs_; ++i) {
        // TODO(fbarchard): Compute skip to avoid this
        assert(skip % GetVertSubSampFactor(i) == 0);
        int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
        int data_to_skip = rows_to_skip * GetComponentStride(i);
        // Change our own data buffer pointers so we can pass them to the
        // callback.
        databuf_[i] += data_to_skip;
      }
      int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip;
      // Never report more rows than the caller asked for; otherwise a very
      // small dst_height makes the callback receive too many rows and drives
      // lines_left negative.
      if (scanlines_to_copy > lines_left) {
        scanlines_to_copy = lines_left;
      }
      (*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy);
      // Now change them back.
      for (int i = 0; i < num_outbufs_; ++i) {
        int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
        int data_to_skip = rows_to_skip * GetComponentStride(i);
        databuf_[i] -= data_to_skip;
      }
      lines_left -= scanlines_to_copy;
    }
  }
  // Read full MCUs until we get to the crop point.
  for (; lines_left >= GetImageScanlinesPerImcuRow();
       lines_left -= GetImageScanlinesPerImcuRow()) {
    if (!DecodeImcuRow()) {
      FinishDecode();
      return LIBYUV_FALSE;
    }
    (*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow());
  }
  if (lines_left > 0) {
    // Have a partial iMCU row left over to decode.
    if (!DecodeImcuRow()) {
      FinishDecode();
      return LIBYUV_FALSE;
    }
    (*fn)(opaque, databuf_, databuf_strides_, lines_left);
  }
  return FinishDecode();
}
|
||||
|
||||
// jpeglib source-manager hook: primes the input by requesting the first
// buffer. The result of that request is not inspected here.
void init_source(j_decompress_ptr cinfo) {
  (void)fill_input_buffer(cinfo);
}
|
||||
|
||||
// jpeglib source-manager hook: points the source manager at the next buffer
// of the BufferVector stored in cinfo->client_data and advances its position.
// Returns FALSE when every buffer has already been handed out, else TRUE.
boolean fill_input_buffer(j_decompress_ptr cinfo) {
  BufferVector* const vec = reinterpret_cast<BufferVector*>(cinfo->client_data);
  const bool exhausted = vec->pos >= vec->len;
  if (exhausted) {
    // ERROR: No more data
    return FALSE;
  }
  jpeg_source_mgr* const src = cinfo->src;
  src->next_input_byte = vec->buffers[vec->pos].data;
  src->bytes_in_buffer = vec->buffers[vec->pos].len;
  ++vec->pos;
  return TRUE;
}
|
||||
|
||||
// jpeglib source-manager hook: discards |num_bytes| of pending input.
//   cinfo:     decompressor whose source manager is advanced.
//   num_bytes: bytes to skip. Zero or negative counts are a no-op, as the
//              libjpeg source-manager contract specifies.
// Asking for more than the buffer holds empties the buffer.
void skip_input_data(j_decompress_ptr cinfo, long num_bytes) {  // NOLINT
  if (num_bytes <= 0) {
    // A negative value must not reach the size_t cast below: it would turn
    // into a huge count and throw away the whole buffer.
    return;
  }
  jpeg_source_mgr* src = cinfo->src;
  size_t bytes = (size_t)num_bytes;
  if (bytes > src->bytes_in_buffer) {
    src->next_input_byte = nullptr;
    src->bytes_in_buffer = 0;
  } else {
    src->next_input_byte += bytes;
    src->bytes_in_buffer -= bytes;
  }
}
|
||||
|
||||
// jpeglib source-manager hook: intentionally a no-op.
void term_source(j_decompress_ptr cinfo) {
  // Nothing to do; the cast only marks the parameter as deliberately unused.
  (void)cinfo;
}
|
||||
|
||||
#ifdef HAVE_SETJMP
|
||||
// Replacement for jpeglib's error_exit callback. Never returns to jpeglib:
// it jumps back to the most recent setjmp() on the decoder's SetJmpErrorMgr,
// which then returns a second time with value 1.
void ErrorHandler(j_common_ptr cinfo) {
  // jpeglib expects its error handler to terminate the program rather than
  // return. To recover from errors instead, setjmp()/longjmp() is used as
  // shown in the jpeglib example.
  // A formatted message can be output, but is unsafe for release.
#ifdef DEBUG
  char message[JMSG_LENGTH_MAX];
  (*cinfo->err->format_message)(cinfo, message);
// ERROR: Error in jpeglib: message
#endif

  // cinfo->err is cast back to SetJmpErrorMgr, whose first member is the
  // jpeg_error_mgr.
  SetJmpErrorMgr* const jump_mgr =
      reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
  // Rewind the call stack to the point of the corresponding setjmp().
  longjmp(jump_mgr->setjmp_buffer, 1);
}
|
||||
|
||||
// Suppress fprintf warnings.
|
||||
void OutputHandler(j_common_ptr cinfo) {
|
||||
(void)cinfo;
|
||||
}
|
||||
|
||||
#endif // HAVE_SETJMP
|
||||
|
||||
void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
|
||||
if (num_outbufs != num_outbufs_) {
|
||||
// We could perhaps optimize this case to resize the output buffers without
|
||||
// necessarily having to delete and recreate each one, but it's not worth
|
||||
// it.
|
||||
DestroyOutputBuffers();
|
||||
|
||||
scanlines_ = new uint8_t**[num_outbufs];
|
||||
scanlines_sizes_ = new int[num_outbufs];
|
||||
databuf_ = new uint8_t*[num_outbufs];
|
||||
databuf_strides_ = new int[num_outbufs];
|
||||
|
||||
for (int i = 0; i < num_outbufs; ++i) {
|
||||
scanlines_[i] = NULL;
|
||||
scanlines_sizes_[i] = 0;
|
||||
databuf_[i] = NULL;
|
||||
databuf_strides_[i] = 0;
|
||||
}
|
||||
|
||||
num_outbufs_ = num_outbufs;
|
||||
}
|
||||
}
|
||||
|
||||
// Releases every per-plane scanline table and data buffer, then the four
// bookkeeping arrays themselves, and resets the decoder to "no output
// buffers" (null arrays, num_outbufs_ == 0).
void MJpegDecoder::DestroyOutputBuffers() {
  int plane = 0;
  while (plane < num_outbufs_) {
    delete[] scanlines_[plane];
    delete[] databuf_[plane];
    ++plane;
  }

  delete[] scanlines_;
  scanlines_ = NULL;

  delete[] databuf_;
  databuf_ = NULL;

  delete[] scanlines_sizes_;
  scanlines_sizes_ = NULL;

  delete[] databuf_strides_;
  databuf_strides_ = NULL;

  num_outbufs_ = 0;
}
|
||||
|
||||
// JDCT_IFAST and disabling do_block_smoothing improve performance substantially.
|
||||
// Configures the decompressor for fast raw (planar) output and starts it.
// Returns LIBYUV_TRUE when jpeg_start_decompress() succeeds and LIBYUV_FALSE
// otherwise.
LIBYUV_BOOL MJpegDecoder::StartDecode() {
  auto* const jpeg = decompress_struct_;

  jpeg->raw_data_out = TRUE;
  jpeg->dct_method = JDCT_IFAST;  // The default would be JDCT_ISLOW.
  jpeg->dither_mode = JDITHER_NONE;
  // Has no effect on 'raw' output:
  jpeg->do_fancy_upsampling = (boolean)(LIBYUV_FALSE);
  // Only relevant in buffered mode:
  jpeg->enable_2pass_quant = (boolean)(LIBYUV_FALSE);
  // Smoothing off: blockier output, but faster.
  jpeg->do_block_smoothing = (boolean)(LIBYUV_FALSE);

  if (jpeg_start_decompress(jpeg)) {
    return LIBYUV_TRUE;
  }
  // ERROR: Couldn't start JPEG decompressor
  return LIBYUV_FALSE;
}
|
||||
|
||||
// Ends the current decode. jpeg_abort_decompress() is used instead of the
// "finish" call because jpeglib treats finishing before the whole image has
// been decoded as an error. Always returns LIBYUV_TRUE.
LIBYUV_BOOL MJpegDecoder::FinishDecode() {
  jpeg_abort_decompress(decompress_struct_);
  return LIBYUV_TRUE;
}
|
||||
|
||||
void MJpegDecoder::SetScanlinePointers(uint8_t** data) {
|
||||
for (int i = 0; i < num_outbufs_; ++i) {
|
||||
uint8_t* data_i = data[i];
|
||||
for (int j = 0; j < scanlines_sizes_[i]; ++j) {
|
||||
scanlines_[i][j] = data_i;
|
||||
data_i += GetComponentStride(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Reads one iMCU row of raw data into scanlines_. Succeeds (non-zero) only
// when jpeg_read_raw_data() reports exactly as many scanlines as were
// requested.
inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() {
  const unsigned int lines_wanted =
      (unsigned int)(GetImageScanlinesPerImcuRow());
  const unsigned int lines_read = jpeg_read_raw_data(
      decompress_struct_, scanlines_, GetImageScanlinesPerImcuRow());
  return lines_wanted == lines_read;
}
|
||||
|
||||
// The helper function which recognizes the jpeg sub-sampling type.
|
||||
// Classifies a set of per-component subsampling factors.
//   3 components, first 1x1, other two 2x2 -> kJpegYuv420
//   3 components, first 1x1, other two 2x1 -> kJpegYuv422
//   3 components, all 1x1                  -> kJpegYuv444
//   1 component, 1x1                       -> kJpegYuv400
// Anything else yields kJpegUnknown. The arrays are only read for component
// counts of 1 or 3.
JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
    int* subsample_x,
    int* subsample_y,
    int number_of_components) {
  const bool is_color = number_of_components == 3;
  const bool is_grey = number_of_components == 1;
  if (!is_color && !is_grey) {
    return kJpegUnknown;
  }
  // Every recognised layout has an unsubsampled first component.
  if (subsample_x[0] != 1 || subsample_y[0] != 1) {
    return kJpegUnknown;
  }
  if (is_grey) {
    return kJpegYuv400;
  }

  // Color: the second and third components must use identical factors.
  const int chroma_x = subsample_x[1];
  const int chroma_y = subsample_y[1];
  if (subsample_x[2] != chroma_x || subsample_y[2] != chroma_y) {
    return kJpegUnknown;
  }
  if (chroma_x == 2 && chroma_y == 2) {
    return kJpegYuv420;
  }
  if (chroma_x == 2 && chroma_y == 1) {
    return kJpegYuv422;
  }
  if (chroma_x == 1 && chroma_y == 1) {
    return kJpegYuv444;
  }
  return kJpegUnknown;
}
|
||||
|
||||
} // namespace libyuv
|
||||
#endif // HAVE_JPEG
|
||||
71
3rdparty/libyuv/source/mjpeg_validate.cc
vendored
Normal file
71
3rdparty/libyuv/source/mjpeg_validate.cc
vendored
Normal file
@@ -0,0 +1,71 @@
|
||||
/*
|
||||
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/mjpeg_decoder.h"
|
||||
|
||||
#include <string.h> // For memchr.
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Helper function to scan for EOI marker (0xff 0xd9).
|
||||
// Searches [src_mjpg, src_mjpg + src_size_mjpg) for the two-byte End Of Image
// marker 0xff 0xd9. Returns LIBYUV_TRUE when found, LIBYUV_FALSE otherwise
// (including when fewer than 2 bytes are supplied).
static LIBYUV_BOOL ScanEOI(const uint8_t* src_mjpg, size_t src_size_mjpg) {
  if (src_size_mjpg < 2) {
    // ERROR: Invalid jpeg end code not found. Size src_size_mjpg
    return LIBYUV_FALSE;
  }
  // The last byte can only ever be the second half of a marker, so the 0xff
  // search stops one byte short; this keeps cursor[1] inside the buffer.
  const uint8_t* const last = src_mjpg + src_size_mjpg - 1;
  const uint8_t* cursor;
  for (cursor = src_mjpg; cursor < last; ++cursor) {  // ++ skips this 0xff.
    // TODO(fbarchard): scan for 0xd9 instead.
    cursor = (const uint8_t*)(memchr(cursor, 0xff, last - cursor));
    if (cursor == NULL) {
      break;
    }
    if (cursor[1] == 0xd9) {
      return LIBYUV_TRUE;  // Success: Valid jpeg.
    }
  }
  // ERROR: Invalid jpeg end code not found. Size src_size_mjpg
  return LIBYUV_FALSE;
}
|
||||
|
||||
// Helper function to validate the jpeg appears intact.
|
||||
// Cheap sanity check that a buffer looks like an intact JPEG.
// Requirements: non-null pointer, size between 64 bytes and 0x7fffffff,
// leading bytes 0xff 0xd8 0xff (SOI followed by a marker prefix), and an EOI
// marker (0xff 0xd9) somewhere after the SOI. The final 1024 bytes are
// searched first; if nothing is found there, the rest is scanned from the
// front.
LIBYUV_BOOL ValidateJpeg(const uint8_t* src_mjpg, size_t src_size_mjpg) {
  // Maximum size that ValidateJpeg will consider valid.
  const size_t kMaxJpegSize = 0x7fffffffull;
  const size_t kBackSearchSize = 1024;

  const LIBYUV_BOOL size_in_range =
      src_size_mjpg >= 64 && src_size_mjpg <= kMaxJpegSize;
  if (!size_in_range || !src_mjpg) {
    // ERROR: Invalid jpeg size: src_size_mjpg
    return LIBYUV_FALSE;
  }

  const LIBYUV_BOOL starts_with_soi =
      src_mjpg[0] == 0xff && src_mjpg[1] == 0xd8 && src_mjpg[2] == 0xff;
  if (!starts_with_soi) {
    // ERROR: Invalid jpeg initial start code
    return LIBYUV_FALSE;
  }

  size_t forward_size = src_size_mjpg;
  if (src_size_mjpg > kBackSearchSize) {
    // Look for the EOI marker near the end of the buffer first.
    const uint8_t* tail = src_mjpg + src_size_mjpg - kBackSearchSize;
    if (ScanEOI(tail, kBackSearchSize)) {
      return LIBYUV_TRUE;  // Success: Valid jpeg.
    }
    // Shrink the forward search to what the tail scan did not cover; the
    // extra byte lets a marker straddling the boundary still be found.
    forward_size = src_size_mjpg - kBackSearchSize + 1;
  }
  // Step over the SOI marker and scan for EOI.
  return ScanEOI(src_mjpg + 2, forward_size - 2);
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
5671
3rdparty/libyuv/source/planar_functions.cc
vendored
Normal file
5671
3rdparty/libyuv/source/planar_functions.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1241
3rdparty/libyuv/source/rotate.cc
vendored
Normal file
1241
3rdparty/libyuv/source/rotate.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
76
3rdparty/libyuv/source/rotate_any.cc
vendored
Normal file
76
3rdparty/libyuv/source/rotate_any.cc
vendored
Normal file
@@ -0,0 +1,76 @@
|
||||
/*
|
||||
* Copyright 2015 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/rotate.h"
|
||||
#include "libyuv/rotate_row.h"
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// TANY(NAMEANY, TPOS_SIMD, TPOS_C, MASK) defines NAMEANY, a transpose wrapper
// that accepts any width:
//   r = width & MASK  - leftover columns the SIMD kernel cannot take
//   n = width - r     - leading columns, handed to TPOS_SIMD when n > 0
// The remaining r columns are processed by TPOS_C, starting n bytes into the
// source row and n rows into the destination.
// MASK is parenthesized so that an expression argument cannot change the
// meaning of `width & MASK`.
#define TANY(NAMEANY, TPOS_SIMD, TPOS_C, MASK)                        \
  void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst,      \
               int dst_stride, int width) {                           \
    int r = width & (MASK);                                           \
    int n = width - r;                                                \
    if (n > 0) {                                                      \
      TPOS_SIMD(src, src_stride, dst, dst_stride, n);                 \
    }                                                                 \
    TPOS_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \
  }

#ifdef HAS_TRANSPOSEWX8_NEON
TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, TransposeWx8_C, 7)
#endif
#ifdef HAS_TRANSPOSEWX16_NEON
TANY(TransposeWx16_Any_NEON, TransposeWx16_NEON, TransposeWx16_C, 15)
#endif
#ifdef HAS_TRANSPOSEWX8_SSSE3
TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, TransposeWx8_C, 7)
#endif
#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, TransposeWx8_C, 15)
#endif
#ifdef HAS_TRANSPOSEWX16_LSX
TANY(TransposeWx16_Any_LSX, TransposeWx16_LSX, TransposeWx16_C, 15)
#endif
#undef TANY
|
||||
|
||||
// TUVANY(NAMEANY, TPOS_SIMD, MASK) defines NAMEANY, an any-width wrapper for
// transposes that split interleaved two-byte pairs into two destinations:
//   r = width & MASK  - leftover pairs the SIMD kernel cannot take
//   n = width - r     - leading pairs, handed to TPOS_SIMD when n > 0
// The remaining r pairs go through TransposeUVWx8_C, starting n * 2 bytes
// into the source row and n rows into each destination.
// MASK is parenthesized so that an expression argument cannot change the
// meaning of `width & MASK`.
#define TUVANY(NAMEANY, TPOS_SIMD, MASK)                                       \
  void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst_a,             \
               int dst_stride_a, uint8_t* dst_b, int dst_stride_b,             \
               int width) {                                                    \
    int r = width & (MASK);                                                    \
    int n = width - r;                                                         \
    if (n > 0) {                                                               \
      TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, n); \
    }                                                                          \
    TransposeUVWx8_C(src + n * 2, src_stride, dst_a + n * dst_stride_a,        \
                     dst_stride_a, dst_b + n * dst_stride_b, dst_stride_b, r); \
  }

#ifdef HAS_TRANSPOSEUVWX8_NEON
TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7)
#endif
#ifdef HAS_TRANSPOSEUVWX8_SSE2
TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7)
#endif
#ifdef HAS_TRANSPOSEUVWX16_LSX
TUVANY(TransposeUVWx16_Any_LSX, TransposeUVWx16_LSX, 7)
#endif
#undef TUVANY
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
259
3rdparty/libyuv/source/rotate_argb.cc
vendored
Normal file
259
3rdparty/libyuv/source/rotate_argb.cc
vendored
Normal file
@@ -0,0 +1,259 @@
|
||||
/*
|
||||
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/rotate_argb.h"
|
||||
|
||||
#include "libyuv/convert.h"
|
||||
#include "libyuv/cpu_id.h"
|
||||
#include "libyuv/planar_functions.h"
|
||||
#include "libyuv/rotate.h"
|
||||
#include "libyuv/row.h"
|
||||
#include "libyuv/scale_row.h" /* for ScaleARGBRowDownEven_ */
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
static int ARGBTranspose(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_argb,
|
||||
int dst_stride_argb,
|
||||
int width,
|
||||
int height) {
|
||||
int i;
|
||||
int src_pixel_step = src_stride_argb >> 2;
|
||||
void (*ScaleARGBRowDownEven)(
|
||||
const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step,
|
||||
uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C;
|
||||
// Check stride is a multiple of 4.
|
||||
if (src_stride_argb & 3) {
|
||||
return -1;
|
||||
}
|
||||
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2;
|
||||
if (IS_ALIGNED(height, 4)) { // Width of dest.
|
||||
ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_NEON;
|
||||
if (IS_ALIGNED(height, 4)) { // Width of dest.
|
||||
ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBROWDOWNEVEN_LSX)
|
||||
if (TestCpuFlag(kCpuHasLSX)) {
|
||||
ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_LSX;
|
||||
if (IS_ALIGNED(height, 4)) { // Width of dest.
|
||||
ScaleARGBRowDownEven = ScaleARGBRowDownEven_LSX;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBROWDOWNEVEN_RVV)
|
||||
if (TestCpuFlag(kCpuHasRVV)) {
|
||||
ScaleARGBRowDownEven = ScaleARGBRowDownEven_RVV;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (i = 0; i < width; ++i) { // column of source to row of dest.
|
||||
ScaleARGBRowDownEven(src_argb, 0, src_pixel_step, dst_argb, height);
|
||||
dst_argb += dst_stride_argb;
|
||||
src_argb += 4;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ARGBRotate90(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_argb,
|
||||
int dst_stride_argb,
|
||||
int width,
|
||||
int height) {
|
||||
// Rotate by 90 is a ARGBTranspose with the source read
|
||||
// from bottom to top. So set the source pointer to the end
|
||||
// of the buffer and flip the sign of the source stride.
|
||||
src_argb += src_stride_argb * (height - 1);
|
||||
src_stride_argb = -src_stride_argb;
|
||||
return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
|
||||
width, height);
|
||||
}
|
||||
|
||||
static int ARGBRotate270(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_argb,
|
||||
int dst_stride_argb,
|
||||
int width,
|
||||
int height) {
|
||||
// Rotate by 270 is a ARGBTranspose with the destination written
|
||||
// from bottom to top. So set the destination pointer to the end
|
||||
// of the buffer and flip the sign of the destination stride.
|
||||
dst_argb += dst_stride_argb * (width - 1);
|
||||
dst_stride_argb = -dst_stride_argb;
|
||||
return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
|
||||
width, height);
|
||||
}
|
||||
|
||||
static int ARGBRotate180(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_argb,
|
||||
int dst_stride_argb,
|
||||
int width,
|
||||
int height) {
|
||||
// Swap first and last row and mirror the content. Uses a temporary row.
|
||||
const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1);
|
||||
uint8_t* dst_bot = dst_argb + dst_stride_argb * (height - 1);
|
||||
int half_height = (height + 1) >> 1;
|
||||
int y;
|
||||
void (*ARGBMirrorRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
|
||||
ARGBMirrorRow_C;
|
||||
void (*CopyRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
|
||||
CopyRow_C;
|
||||
align_buffer_64(row, width * 4);
|
||||
if (!row)
|
||||
return 1;
|
||||
#if defined(HAS_ARGBMIRRORROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
ARGBMirrorRow = ARGBMirrorRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBMIRRORROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
|
||||
if (IS_ALIGNED(width, 4)) {
|
||||
ARGBMirrorRow = ARGBMirrorRow_SSE2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBMIRRORROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
ARGBMirrorRow = ARGBMirrorRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBMIRRORROW_LSX)
|
||||
if (TestCpuFlag(kCpuHasLSX)) {
|
||||
ARGBMirrorRow = ARGBMirrorRow_Any_LSX;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
ARGBMirrorRow = ARGBMirrorRow_LSX;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBMIRRORROW_LASX)
|
||||
if (TestCpuFlag(kCpuHasLASX)) {
|
||||
ARGBMirrorRow = ARGBMirrorRow_Any_LASX;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBMirrorRow = ARGBMirrorRow_LASX;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_AVX)
|
||||
if (TestCpuFlag(kCpuHasAVX)) {
|
||||
CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
CopyRow =
|
||||
IS_ALIGNED(width * 4, 128) ? CopyRow_AVX512BW : CopyRow_Any_AVX512BW;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_ERMS)
|
||||
if (TestCpuFlag(kCpuHasERMS)) {
|
||||
CopyRow = CopyRow_ERMS;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_SME)
|
||||
if (TestCpuFlag(kCpuHasSME)) {
|
||||
CopyRow = CopyRow_SME;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_RVV)
|
||||
if (TestCpuFlag(kCpuHasRVV)) {
|
||||
CopyRow = CopyRow_RVV;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Odd height will harmlessly mirror the middle row twice.
|
||||
for (y = 0; y < half_height; ++y) {
|
||||
ARGBMirrorRow(src_argb, row, width); // Mirror first row into a buffer
|
||||
ARGBMirrorRow(src_bot, dst_argb, width); // Mirror last row into first row
|
||||
CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last
|
||||
src_argb += src_stride_argb;
|
||||
dst_argb += dst_stride_argb;
|
||||
src_bot -= src_stride_argb;
|
||||
dst_bot -= dst_stride_argb;
|
||||
}
|
||||
free_aligned_buffer_64(row);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Rotates an ARGB image by 0, 90, 180 or 270 degrees.
//
// A negative height means the source image is inverted: it is read from its
// last row upwards.
// Returns -1 when src_argb or dst_argb is null, width <= 0, height == 0, or
// mode is not one of the four supported rotations; otherwise returns the
// result of the selected helper (ARGBCopy for kRotate0).
LIBYUV_API
int ARGBRotate(const uint8_t* src_argb,
               int src_stride_argb,
               uint8_t* dst_argb,
               int dst_stride_argb,
               int width,
               int height,
               enum RotationMode mode) {
  if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
    return -1;
  }

  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    // The offset is computed in ptrdiff_t: (height - 1) * stride can exceed
    // the range of int for very large images.
    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
    src_stride_argb = -src_stride_argb;
  }

  switch (mode) {
    case kRotate0:
      // copy frame
      return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
                      width, height);
    case kRotate90:
      return ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
                          width, height);
    case kRotate270:
      return ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
                           width, height);
    case kRotate180:
      return ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
                           width, height);
    default:
      break;
  }
  return -1;
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
208
3rdparty/libyuv/source/rotate_common.cc
vendored
Normal file
208
3rdparty/libyuv/source/rotate_common.cc
vendored
Normal file
@@ -0,0 +1,208 @@
|
||||
/*
|
||||
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/rotate_row.h"
|
||||
#include "libyuv/row.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Transposes a block that is `width` columns wide and 8 rows tall.
// Each source column (8 bytes, one per row, src_stride apart) becomes one
// 8-byte destination row; successive destination rows are dst_stride apart.
void TransposeWx8_C(const uint8_t* src,
                    int src_stride,
                    uint8_t* dst,
                    int dst_stride,
                    int width) {
  for (; width > 0; --width) {
    int row;
    for (row = 0; row < 8; ++row) {
      dst[row] = src[row * src_stride];
    }
    ++src;              // next source column
    dst += dst_stride;  // next destination row
  }
}
|
||||
|
||||
// Transposes a block that is `width` columns wide and 16 rows tall by running
// the 8-row transpose twice: rows 0-7 fill destination bytes 0-7 of each
// output row, rows 8-15 fill bytes 8-15.
void TransposeWx16_C(const uint8_t* src,
                     int src_stride,
                     uint8_t* dst,
                     int dst_stride,
                     int width) {
  int half;
  for (half = 0; half < 2; ++half) {
    TransposeWx8_C(src + (half * 8) * src_stride, src_stride, dst + half * 8,
                   dst_stride, width);
  }
}
|
||||
|
||||
// Transposes `width` interleaved two-byte pairs across 8 source rows while
// splitting them: the first byte of every pair goes to dst_a, the second to
// dst_b. Each source pair column yields one 8-byte row in each destination.
void TransposeUVWx8_C(const uint8_t* src,
                      int src_stride,
                      uint8_t* dst_a,
                      int dst_stride_a,
                      uint8_t* dst_b,
                      int dst_stride_b,
                      int width) {
  for (; width > 0; --width) {
    int row;
    for (row = 0; row < 8; ++row) {
      dst_a[row] = src[row * src_stride + 0];
      dst_b[row] = src[row * src_stride + 1];
    }
    src += 2;  // next interleaved pair
    dst_a += dst_stride_a;
    dst_b += dst_stride_b;
  }
}
|
||||
|
||||
// Generic byte transpose of a width x height block:
// dst[col * dst_stride + row] = src[row * src_stride + col].
void TransposeWxH_C(const uint8_t* src,
                    int src_stride,
                    uint8_t* dst,
                    int dst_stride,
                    int width,
                    int height) {
  int col;
  for (col = 0; col < width; ++col) {
    const uint8_t* column_in = src + col;
    uint8_t* row_out = dst + col * dst_stride;
    int row;
    for (row = 0; row < height; ++row) {
      row_out[row] = column_in[row * src_stride];
    }
  }
}
|
||||
|
||||
// Generic transpose of `width` interleaved two-byte pairs over `height` rows,
// splitting the pairs: for pair column c and row r,
//   dst_a[c * dst_stride_a + r] = src[r * src_stride + 2 * c]
//   dst_b[c * dst_stride_b + r] = src[r * src_stride + 2 * c + 1]
void TransposeUVWxH_C(const uint8_t* src,
                      int src_stride,
                      uint8_t* dst_a,
                      int dst_stride_a,
                      uint8_t* dst_b,
                      int dst_stride_b,
                      int width,
                      int height) {
  int pair;
  for (pair = 0; pair < width; ++pair) {
    const uint8_t* column_in = src + pair * 2;
    uint8_t* row_a = dst_a + pair * dst_stride_a;
    uint8_t* row_b = dst_b + pair * dst_stride_b;
    int row;
    for (row = 0; row < height; ++row) {
      row_a[row] = column_in[row * src_stride];
      row_b[row] = column_in[row * src_stride + 1];
    }
  }
}
|
||||
|
||||
// 16-bit variant of the 8-row transpose: every source column of 8 uint16_t
// values (src_stride elements apart) becomes one destination row of 8 values;
// successive destination rows are dst_stride elements apart.
void TransposeWx8_16_C(const uint16_t* src,
                       int src_stride,
                       uint16_t* dst,
                       int dst_stride,
                       int width) {
  for (; width > 0; --width) {
    int row;
    for (row = 0; row < 8; ++row) {
      dst[row] = src[row * src_stride];
    }
    ++src;              // next source column
    dst += dst_stride;  // next destination row
  }
}
|
||||
|
||||
// Generic 16-bit transpose of a width x height block (strides in elements):
// dst[col * dst_stride + row] = src[row * src_stride + col].
void TransposeWxH_16_C(const uint16_t* src,
                       int src_stride,
                       uint16_t* dst,
                       int dst_stride,
                       int width,
                       int height) {
  int col;
  for (col = 0; col < width; ++col) {
    const uint16_t* column_in = src + col;
    uint16_t* row_out = dst + col * dst_stride;
    int row;
    for (row = 0; row < height; ++row) {
      row_out[row] = column_in[row * src_stride];
    }
  }
}
|
||||
|
||||
// Transpose 32 bit values (ARGB)
|
||||
// Transpose 32 bit values (ARGB) in 4x4 tiles.
// Each loop pass reads a tile of 4 source rows by 4 pixels and writes it
// transposed into 4 destination rows; the source then moves down 4 rows and
// the destination moves right 4 pixels (16 bytes). `width` counts destination
// pixels and is consumed 4 at a time. Strides are in bytes.
// The whole tile is loaded before anything is stored.
void Transpose4x4_32_C(const uint8_t* src,
                       int src_stride,
                       uint8_t* dst,
                       int dst_stride,
                       int width) {
  int done;
  for (done = 0; done < width; done += 4) {
    uint32_t tile[4][4];  // tile[source row][source column]
    int r;
    int c;
    for (r = 0; r < 4; ++r) {
      const uint32_t* in = (const uint32_t*)(src + r * src_stride);
      for (c = 0; c < 4; ++c) {
        tile[r][c] = in[c];
      }
    }
    for (c = 0; c < 4; ++c) {
      uint32_t* out = (uint32_t*)(dst + c * dst_stride);
      for (r = 0; r < 4; ++r) {
        out[r] = tile[r][c];
      }
    }
    src += src_stride * 4;  // advance 4 rows
    dst += 4 * 4;           // advance 4 columns
  }
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
505
3rdparty/libyuv/source/rotate_gcc.cc
vendored
Normal file
505
3rdparty/libyuv/source/rotate_gcc.cc
vendored
Normal file
@@ -0,0 +1,505 @@
|
||||
/*
|
||||
* Copyright 2015 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/rotate_row.h"
|
||||
#include "libyuv/row.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// This module is for GCC x86 and x64.
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(__x86_64__) || defined(__i386__)) && \
|
||||
!defined(LIBYUV_ENABLE_ROWWIN)
|
||||
|
||||
// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
|
||||
#if defined(HAS_TRANSPOSEWX8_SSSE3)
|
||||
// Inline-assembly transpose for 8-row blocks.
// Each loop pass loads 8 bytes (movq) from each of 8 consecutive source rows,
// interleaves them in three rounds (bytes, words, dwords) and stores eight
// 8-byte rows to dst, dst_stride apart. width is reduced by 8 per pass and
// the loop repeats while the result is still positive.
// NOTE(review): %3 (src_stride) is declared as an input-only operand but is
// negated and then restored inside the loop - confirm this is acceptable for
// the compilers in use.
void TransposeWx8_SSSE3(const uint8_t* src,
|
||||
int src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_stride,
|
||||
int width) {
|
||||
asm volatile(
|
||||
// Read in the data from the source pointer.
|
||||
// First round of bit swap.
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movq (%0),%%xmm0 \n"
|
||||
"movq (%0,%3),%%xmm1 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||
"movq (%0),%%xmm2 \n"
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"palignr $0x8,%%xmm1,%%xmm1 \n"
|
||||
"movq (%0,%3),%%xmm3 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"punpcklbw %%xmm3,%%xmm2 \n"
|
||||
"movdqa %%xmm2,%%xmm3 \n"
|
||||
"movq (%0),%%xmm4 \n"
|
||||
"palignr $0x8,%%xmm3,%%xmm3 \n"
|
||||
"movq (%0,%3),%%xmm5 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"punpcklbw %%xmm5,%%xmm4 \n"
|
||||
"movdqa %%xmm4,%%xmm5 \n"
|
||||
"movq (%0),%%xmm6 \n"
|
||||
"palignr $0x8,%%xmm5,%%xmm5 \n"
|
||||
"movq (%0,%3),%%xmm7 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"punpcklbw %%xmm7,%%xmm6 \n"
|
||||
// src has advanced 8 rows by now. The stride is negated temporarily so the
// lea below can step back those 8 rows while moving 8 bytes to the right.
"neg %3 \n"
|
||||
"movdqa %%xmm6,%%xmm7 \n"
|
||||
"lea 0x8(%0,%3,8),%0 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
// Restore the stride to its original sign.
"neg %3 \n"
|
||||
// Second round of bit swap.
|
||||
"punpcklwd %%xmm2,%%xmm0 \n"
|
||||
"punpcklwd %%xmm3,%%xmm1 \n"
|
||||
"movdqa %%xmm0,%%xmm2 \n"
|
||||
"movdqa %%xmm1,%%xmm3 \n"
|
||||
"palignr $0x8,%%xmm2,%%xmm2 \n"
|
||||
"palignr $0x8,%%xmm3,%%xmm3 \n"
|
||||
"punpcklwd %%xmm6,%%xmm4 \n"
|
||||
"punpcklwd %%xmm7,%%xmm5 \n"
|
||||
"movdqa %%xmm4,%%xmm6 \n"
|
||||
"movdqa %%xmm5,%%xmm7 \n"
|
||||
"palignr $0x8,%%xmm6,%%xmm6 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
// Third round of bit swap.
|
||||
// Write to the destination pointer.
|
||||
"punpckldq %%xmm4,%%xmm0 \n"
|
||||
"movq %%xmm0,(%1) \n"
|
||||
"movdqa %%xmm0,%%xmm4 \n"
|
||||
"palignr $0x8,%%xmm4,%%xmm4 \n"
|
||||
"movq %%xmm4,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm6,%%xmm2 \n"
|
||||
"movdqa %%xmm2,%%xmm6 \n"
|
||||
"movq %%xmm2,(%1) \n"
|
||||
"palignr $0x8,%%xmm6,%%xmm6 \n"
|
||||
"punpckldq %%xmm5,%%xmm1 \n"
|
||||
"movq %%xmm6,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"movdqa %%xmm1,%%xmm5 \n"
|
||||
"movq %%xmm1,(%1) \n"
|
||||
"palignr $0x8,%%xmm5,%%xmm5 \n"
|
||||
"movq %%xmm5,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm7,%%xmm3 \n"
|
||||
"movq %%xmm3,(%1) \n"
|
||||
"movdqa %%xmm3,%%xmm7 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
// Count down the 8 columns just handled; the jg at the end of the loop
// branches on the flags produced by this sub.
"sub $0x8,%2 \n"
|
||||
"movq %%xmm7,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"((intptr_t)(src_stride)), // %3
|
||||
"r"((intptr_t)(dst_stride)) // %4
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
}
|
||||
#endif // defined(HAS_TRANSPOSEWX8_SSSE3)
|
||||
|
||||
// Transpose 16x8. 64 bit
|
||||
#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
|
||||
// Transposes a strip that is 8 source rows high, 16 source columns at a time.
// Each loop iteration:
//   - loads 16 bytes from each of 8 consecutive source rows (src_stride apart),
//   - interleaves them in three rounds (bytes, words, dwords),
//   - stores 16 destination rows of 8 bytes each (dst_stride apart),
//   - leaves src 16 bytes further along the first row and dst 16 rows down.
// width is decremented by 16 per iteration and the loop repeats while the
// result is > 0, so the body always runs at least once.
// NOTE(review): presumably callers pass a positive multiple of 16 - confirm.
// Uses xmm8-xmm15, which only exist in 64-bit mode.
void TransposeWx8_Fast_SSSE3(const uint8_t* src,
|
||||
int src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_stride,
|
||||
int width) {
|
||||
asm volatile(
|
||||
// Read in the data from the source pointer.
|
||||
// First round of bit swap.
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm0 \n"
|
||||
"movdqu (%0,%3),%%xmm1 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"movdqa %%xmm0,%%xmm8 \n"
|
||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||
"punpckhbw %%xmm1,%%xmm8 \n"
|
||||
"movdqu (%0),%%xmm2 \n"
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"movdqa %%xmm8,%%xmm9 \n"
|
||||
"palignr $0x8,%%xmm1,%%xmm1 \n"
|
||||
"palignr $0x8,%%xmm9,%%xmm9 \n"
|
||||
"movdqu (%0,%3),%%xmm3 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"movdqa %%xmm2,%%xmm10 \n"
|
||||
"punpcklbw %%xmm3,%%xmm2 \n"
|
||||
"punpckhbw %%xmm3,%%xmm10 \n"
|
||||
"movdqa %%xmm2,%%xmm3 \n"
|
||||
"movdqa %%xmm10,%%xmm11 \n"
|
||||
"movdqu (%0),%%xmm4 \n"
|
||||
"palignr $0x8,%%xmm3,%%xmm3 \n"
|
||||
"palignr $0x8,%%xmm11,%%xmm11 \n"
|
||||
"movdqu (%0,%3),%%xmm5 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"movdqa %%xmm4,%%xmm12 \n"
|
||||
"punpcklbw %%xmm5,%%xmm4 \n"
|
||||
"punpckhbw %%xmm5,%%xmm12 \n"
|
||||
"movdqa %%xmm4,%%xmm5 \n"
|
||||
"movdqa %%xmm12,%%xmm13 \n"
|
||||
"movdqu (%0),%%xmm6 \n"
|
||||
"palignr $0x8,%%xmm5,%%xmm5 \n"
|
||||
"palignr $0x8,%%xmm13,%%xmm13 \n"
|
||||
"movdqu (%0,%3),%%xmm7 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"movdqa %%xmm6,%%xmm14 \n"
|
||||
"punpcklbw %%xmm7,%%xmm6 \n"
|
||||
"punpckhbw %%xmm7,%%xmm14 \n"
|
||||
// The stride register is negated, used to rewind src by 8 rows while
|
||||
// stepping 16 bytes to the right (src = src - 8 * stride + 16), and then
|
||||
// negated back so the input operand holds its original value again.
|
||||
"neg %3 \n"
|
||||
"movdqa %%xmm6,%%xmm7 \n"
|
||||
"movdqa %%xmm14,%%xmm15 \n"
|
||||
"lea 0x10(%0,%3,8),%0 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
"palignr $0x8,%%xmm15,%%xmm15 \n"
|
||||
"neg %3 \n"
|
||||
// Second round of bit swap.
|
||||
"punpcklwd %%xmm2,%%xmm0 \n"
|
||||
"punpcklwd %%xmm3,%%xmm1 \n"
|
||||
"movdqa %%xmm0,%%xmm2 \n"
|
||||
"movdqa %%xmm1,%%xmm3 \n"
|
||||
"palignr $0x8,%%xmm2,%%xmm2 \n"
|
||||
"palignr $0x8,%%xmm3,%%xmm3 \n"
|
||||
"punpcklwd %%xmm6,%%xmm4 \n"
|
||||
"punpcklwd %%xmm7,%%xmm5 \n"
|
||||
"movdqa %%xmm4,%%xmm6 \n"
|
||||
"movdqa %%xmm5,%%xmm7 \n"
|
||||
"palignr $0x8,%%xmm6,%%xmm6 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
"punpcklwd %%xmm10,%%xmm8 \n"
|
||||
"punpcklwd %%xmm11,%%xmm9 \n"
|
||||
"movdqa %%xmm8,%%xmm10 \n"
|
||||
"movdqa %%xmm9,%%xmm11 \n"
|
||||
"palignr $0x8,%%xmm10,%%xmm10 \n"
|
||||
"palignr $0x8,%%xmm11,%%xmm11 \n"
|
||||
"punpcklwd %%xmm14,%%xmm12 \n"
|
||||
"punpcklwd %%xmm15,%%xmm13 \n"
|
||||
"movdqa %%xmm12,%%xmm14 \n"
|
||||
"movdqa %%xmm13,%%xmm15 \n"
|
||||
"palignr $0x8,%%xmm14,%%xmm14 \n"
|
||||
"palignr $0x8,%%xmm15,%%xmm15 \n"
|
||||
// Third round of bit swap.
|
||||
// Write to the destination pointer.
|
||||
// Each movq below stores one 8-byte destination row; dst advances two rows
|
||||
// per lea, for 16 rows in total.
|
||||
"punpckldq %%xmm4,%%xmm0 \n"
|
||||
"movq %%xmm0,(%1) \n"
|
||||
"movdqa %%xmm0,%%xmm4 \n"
|
||||
"palignr $0x8,%%xmm4,%%xmm4 \n"
|
||||
"movq %%xmm4,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm6,%%xmm2 \n"
|
||||
"movdqa %%xmm2,%%xmm6 \n"
|
||||
"movq %%xmm2,(%1) \n"
|
||||
"palignr $0x8,%%xmm6,%%xmm6 \n"
|
||||
"punpckldq %%xmm5,%%xmm1 \n"
|
||||
"movq %%xmm6,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"movdqa %%xmm1,%%xmm5 \n"
|
||||
"movq %%xmm1,(%1) \n"
|
||||
"palignr $0x8,%%xmm5,%%xmm5 \n"
|
||||
"movq %%xmm5,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm7,%%xmm3 \n"
|
||||
"movq %%xmm3,(%1) \n"
|
||||
"movdqa %%xmm3,%%xmm7 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
"movq %%xmm7,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm12,%%xmm8 \n"
|
||||
"movq %%xmm8,(%1) \n"
|
||||
"movdqa %%xmm8,%%xmm12 \n"
|
||||
"palignr $0x8,%%xmm12,%%xmm12 \n"
|
||||
"movq %%xmm12,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm14,%%xmm10 \n"
|
||||
"movdqa %%xmm10,%%xmm14 \n"
|
||||
"movq %%xmm10,(%1) \n"
|
||||
"palignr $0x8,%%xmm14,%%xmm14 \n"
|
||||
"punpckldq %%xmm13,%%xmm9 \n"
|
||||
"movq %%xmm14,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"movdqa %%xmm9,%%xmm13 \n"
|
||||
"movq %%xmm9,(%1) \n"
|
||||
"palignr $0x8,%%xmm13,%%xmm13 \n"
|
||||
"movq %%xmm13,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm15,%%xmm11 \n"
|
||||
"movq %%xmm11,(%1) \n"
|
||||
"movdqa %%xmm11,%%xmm15 \n"
|
||||
"palignr $0x8,%%xmm15,%%xmm15 \n"
|
||||
// width -= 16. The movq and lea that follow do not modify the flags, so
|
||||
// the jg at the end of the loop still tests the result of this sub.
|
||||
"sub $0x10,%2 \n"
|
||||
"movq %%xmm15,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"((intptr_t)(src_stride)), // %3
|
||||
"r"((intptr_t)(dst_stride)) // %4
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
|
||||
"xmm15");
|
||||
}
|
||||
#endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
|
||||
|
||||
// Transpose UV 8x8. 64 bit.
|
||||
#if defined(HAS_TRANSPOSEUVWX8_SSE2)
|
||||
// Transposes a strip of byte pairs that is 8 source rows high and splits the
// result into two planes. Each loop iteration:
//   - loads 16 bytes (8 two-byte pairs) from each of 8 consecutive source rows,
//   - interleaves them in three rounds (bytes, words, dwords),
//   - stores 8 rows of 8 bytes to dst_a (low halves, movlpd) and 8 rows of
//     8 bytes to dst_b (high halves, movhpd),
//   - leaves src 16 bytes further along the first row and both destination
//     pointers 8 rows down.
// width is decremented by 8 per iteration and the loop repeats while the
// result is > 0, so the body always runs at least once.
// Uses xmm8/xmm9, which only exist in 64-bit mode.
void TransposeUVWx8_SSE2(const uint8_t* src,
|
||||
int src_stride,
|
||||
uint8_t* dst_a,
|
||||
int dst_stride_a,
|
||||
uint8_t* dst_b,
|
||||
int dst_stride_b,
|
||||
int width) {
|
||||
asm volatile(
|
||||
// Read in the data from the source pointer.
|
||||
// First round of bit swap.
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm0 \n"
|
||||
"movdqu (%0,%4),%%xmm1 \n"
|
||||
"lea (%0,%4,2),%0 \n"
|
||||
"movdqa %%xmm0,%%xmm8 \n"
|
||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||
"punpckhbw %%xmm1,%%xmm8 \n"
|
||||
"movdqa %%xmm8,%%xmm1 \n"
|
||||
"movdqu (%0),%%xmm2 \n"
|
||||
"movdqu (%0,%4),%%xmm3 \n"
|
||||
"lea (%0,%4,2),%0 \n"
|
||||
"movdqa %%xmm2,%%xmm8 \n"
|
||||
"punpcklbw %%xmm3,%%xmm2 \n"
|
||||
"punpckhbw %%xmm3,%%xmm8 \n"
|
||||
"movdqa %%xmm8,%%xmm3 \n"
|
||||
"movdqu (%0),%%xmm4 \n"
|
||||
"movdqu (%0,%4),%%xmm5 \n"
|
||||
"lea (%0,%4,2),%0 \n"
|
||||
"movdqa %%xmm4,%%xmm8 \n"
|
||||
"punpcklbw %%xmm5,%%xmm4 \n"
|
||||
"punpckhbw %%xmm5,%%xmm8 \n"
|
||||
"movdqa %%xmm8,%%xmm5 \n"
|
||||
"movdqu (%0),%%xmm6 \n"
|
||||
"movdqu (%0,%4),%%xmm7 \n"
|
||||
"lea (%0,%4,2),%0 \n"
|
||||
"movdqa %%xmm6,%%xmm8 \n"
|
||||
"punpcklbw %%xmm7,%%xmm6 \n"
|
||||
// The stride register is negated, used to rewind src by 8 rows while
|
||||
// stepping 16 bytes to the right (src = src - 8 * stride + 16), and then
|
||||
// negated back so the input operand holds its original value again.
|
||||
"neg %4 \n"
|
||||
"lea 0x10(%0,%4,8),%0 \n"
|
||||
"punpckhbw %%xmm7,%%xmm8 \n"
|
||||
"movdqa %%xmm8,%%xmm7 \n"
|
||||
"neg %4 \n"
|
||||
// Second round of bit swap.
|
||||
"movdqa %%xmm0,%%xmm8 \n"
|
||||
"movdqa %%xmm1,%%xmm9 \n"
|
||||
"punpckhwd %%xmm2,%%xmm8 \n"
|
||||
"punpckhwd %%xmm3,%%xmm9 \n"
|
||||
"punpcklwd %%xmm2,%%xmm0 \n"
|
||||
"punpcklwd %%xmm3,%%xmm1 \n"
|
||||
"movdqa %%xmm8,%%xmm2 \n"
|
||||
"movdqa %%xmm9,%%xmm3 \n"
|
||||
"movdqa %%xmm4,%%xmm8 \n"
|
||||
"movdqa %%xmm5,%%xmm9 \n"
|
||||
"punpckhwd %%xmm6,%%xmm8 \n"
|
||||
"punpckhwd %%xmm7,%%xmm9 \n"
|
||||
"punpcklwd %%xmm6,%%xmm4 \n"
|
||||
"punpcklwd %%xmm7,%%xmm5 \n"
|
||||
"movdqa %%xmm8,%%xmm6 \n"
|
||||
"movdqa %%xmm9,%%xmm7 \n"
|
||||
// Third round of bit swap.
|
||||
// Write to the destination pointer.
|
||||
// Every result register holds one dst_a row in its low 8 bytes and the
|
||||
// matching dst_b row in its high 8 bytes.
|
||||
"movdqa %%xmm0,%%xmm8 \n"
|
||||
"punpckldq %%xmm4,%%xmm0 \n"
|
||||
"movlpd %%xmm0,(%1) \n" // Write back U channel
|
||||
"movhpd %%xmm0,(%2) \n" // Write back V channel
|
||||
"punpckhdq %%xmm4,%%xmm8 \n"
|
||||
"movlpd %%xmm8,(%1,%5) \n"
|
||||
"lea (%1,%5,2),%1 \n"
|
||||
"movhpd %%xmm8,(%2,%6) \n"
|
||||
"lea (%2,%6,2),%2 \n"
|
||||
"movdqa %%xmm2,%%xmm8 \n"
|
||||
"punpckldq %%xmm6,%%xmm2 \n"
|
||||
"movlpd %%xmm2,(%1) \n"
|
||||
"movhpd %%xmm2,(%2) \n"
|
||||
"punpckhdq %%xmm6,%%xmm8 \n"
|
||||
"movlpd %%xmm8,(%1,%5) \n"
|
||||
"lea (%1,%5,2),%1 \n"
|
||||
"movhpd %%xmm8,(%2,%6) \n"
|
||||
"lea (%2,%6,2),%2 \n"
|
||||
"movdqa %%xmm1,%%xmm8 \n"
|
||||
"punpckldq %%xmm5,%%xmm1 \n"
|
||||
"movlpd %%xmm1,(%1) \n"
|
||||
"movhpd %%xmm1,(%2) \n"
|
||||
"punpckhdq %%xmm5,%%xmm8 \n"
|
||||
"movlpd %%xmm8,(%1,%5) \n"
|
||||
"lea (%1,%5,2),%1 \n"
|
||||
"movhpd %%xmm8,(%2,%6) \n"
|
||||
"lea (%2,%6,2),%2 \n"
|
||||
"movdqa %%xmm3,%%xmm8 \n"
|
||||
"punpckldq %%xmm7,%%xmm3 \n"
|
||||
"movlpd %%xmm3,(%1) \n"
|
||||
"movhpd %%xmm3,(%2) \n"
|
||||
"punpckhdq %%xmm7,%%xmm8 \n"
|
||||
// width -= 8. The stores and lea instructions that follow do not modify
|
||||
// the flags, so the jg at the end of the loop still tests this sub.
|
||||
"sub $0x8,%3 \n"
|
||||
"movlpd %%xmm8,(%1,%5) \n"
|
||||
"lea (%1,%5,2),%1 \n"
|
||||
"movhpd %%xmm8,(%2,%6) \n"
|
||||
"lea (%2,%6,2),%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst_a), // %1
|
||||
"+r"(dst_b), // %2
|
||||
"+r"(width) // %3
|
||||
: "r"((intptr_t)(src_stride)), // %4
|
||||
"r"((intptr_t)(dst_stride_a)), // %5
|
||||
"r"((intptr_t)(dst_stride_b)) // %6
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7", "xmm8", "xmm9");
|
||||
}
|
||||
#endif // defined(HAS_TRANSPOSEUVWX8_SSE2)
|
||||
|
||||
#if defined(HAS_TRANSPOSE4X4_32_SSE2)
|
||||
// 4 values, little endian view
|
||||
// a b c d
|
||||
// e f g h
|
||||
// i j k l
|
||||
// m n o p
|
||||
|
||||
// transpose 2x2
|
||||
// a e b f from row 0, 1
|
||||
// i m j n from row 2, 3
|
||||
// c g d h from row 0, 1
|
||||
// k o l p from row 2, 3
|
||||
|
||||
// transpose 4x4
|
||||
// a e i m from row 0, 1
|
||||
// b f j n from row 0, 1
|
||||
// c g k o from row 2, 3
|
||||
// d h l p from row 2, 3
|
||||
|
||||
// Transpose 32 bit values (ARGB)
|
||||
// Transposes 4x4 blocks of 32-bit values. Each loop iteration:
//   - loads 16 bytes (four 32-bit values) from each of 4 consecutive source
//     rows and moves src down by 4 rows,
//   - transposes the 4x4 block in two unpack steps,
//   - stores four 16-byte rows at dst, dst + stride, dst + 2 * stride and
//     dst + 3 * stride, then moves dst 16 bytes to the right.
// width is decremented by 4 per iteration and the loop repeats while the
// result is > 0, so the body always runs at least once.
void Transpose4x4_32_SSE2(const uint8_t* src,
|
||||
int src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_stride,
|
||||
int width) {
|
||||
asm volatile(
|
||||
// Main loop transpose 4x4. Read a column, write a row.
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm0 \n" // a b c d
|
||||
"movdqu (%0,%3),%%xmm1 \n" // e f g h
|
||||
"lea (%0,%3,2),%0 \n" // src += stride * 2
|
||||
"movdqu (%0),%%xmm2 \n" // i j k l
|
||||
"movdqu (%0,%3),%%xmm3 \n" // m n o p
|
||||
"lea (%0,%3,2),%0 \n" // src += stride * 2
|
||||
|
||||
|
||||
// Transpose 2x2
|
||||
"movdqa %%xmm0,%%xmm4 \n"
|
||||
"movdqa %%xmm2,%%xmm5 \n"
|
||||
"movdqa %%xmm0,%%xmm6 \n"
|
||||
"movdqa %%xmm2,%%xmm7 \n"
|
||||
"punpckldq %%xmm1,%%xmm4 \n" // a e b f from row 0, 1
|
||||
"punpckldq %%xmm3,%%xmm5 \n" // i m j n from row 2, 3
|
||||
"punpckhdq %%xmm1,%%xmm6 \n" // c g d h from row 0, 1
|
||||
"punpckhdq %%xmm3,%%xmm7 \n" // k o l p from row 2, 3
|
||||
|
||||
|
||||
// Transpose 4x4
|
||||
"movdqa %%xmm4,%%xmm0 \n"
|
||||
"movdqa %%xmm4,%%xmm1 \n"
|
||||
"movdqa %%xmm6,%%xmm2 \n"
|
||||
"movdqa %%xmm6,%%xmm3 \n"
|
||||
"punpcklqdq %%xmm5,%%xmm0 \n" // a e i m -> output row 0
|
||||
"punpckhqdq %%xmm5,%%xmm1 \n" // b f j n -> output row 1
|
||||
"punpcklqdq %%xmm7,%%xmm2 \n" // c g k o -> output row 2
|
||||
"punpckhqdq %%xmm7,%%xmm3 \n" // d h l p -> output row 3
|
||||
|
||||
|
||||
// Store the four output rows. dst is first bumped by stride + 16 so the
|
||||
// remaining rows can be addressed with a -16 displacement; the final
|
||||
// "sub %4,%1" leaves dst exactly 16 bytes past where it started.
|
||||
"movdqu %%xmm0,(%1) \n"
|
||||
"lea 16(%1,%4),%1 \n" // dst += stride + 16
|
||||
"movdqu %%xmm1,-16(%1) \n"
|
||||
"movdqu %%xmm2,-16(%1,%4) \n"
|
||||
"movdqu %%xmm3,-16(%1,%4,2) \n"
|
||||
"sub %4,%1 \n"
|
||||
"sub $0x4,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+rm"(width) // %2
|
||||
: "r"((ptrdiff_t)(src_stride)), // %3
|
||||
"r"((ptrdiff_t)(dst_stride)) // %4
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
}
|
||||
#endif // defined(HAS_TRANSPOSE4X4_32_SSE2)
|
||||
|
||||
#if defined(HAS_TRANSPOSE4X4_32_AVX2)
|
||||
|
||||
// Transpose 32 bit values (ARGB)
|
||||
// Transposes two vertically adjacent 4x4 blocks of 32-bit values per loop
// iteration. Each iteration:
//   - loads 16 bytes from each of 8 consecutive source rows: rows 0-3 go to
//     the low 128-bit lanes of ymm0-ymm3, rows 4-7 to the high lanes, and src
//     moves down by 8 rows,
//   - transposes both 4x4 blocks lane-wise in two unpack steps,
//   - stores four 32-byte rows at dst, dst + stride, dst + 2 * stride and
//     dst + 3 * stride, then moves dst 32 bytes to the right.
// width is decremented by 8 per iteration and the loop repeats while the
// result is > 0, so the body always runs at least once.
void Transpose4x4_32_AVX2(const uint8_t* src,
|
||||
int src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_stride,
|
||||
int width) {
|
||||
asm volatile(
|
||||
// Main loop transpose 2 blocks of 4x4. Read a column, write a row.
|
||||
"1: \n"
|
||||
"vmovdqu (%0),%%xmm0 \n" // a b c d
|
||||
"vmovdqu (%0,%3),%%xmm1 \n" // e f g h
|
||||
"lea (%0,%3,2),%0 \n" // src += stride * 2
|
||||
"vmovdqu (%0),%%xmm2 \n" // i j k l
|
||||
"vmovdqu (%0,%3),%%xmm3 \n" // m n o p
|
||||
"lea (%0,%3,2),%0 \n" // src += stride * 2
|
||||
|
||||
|
||||
// Second 4x4 block (next four source rows) goes into the high lanes.
|
||||
"vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // a b c d
|
||||
"vinserti128 $1,(%0,%3),%%ymm1,%%ymm1 \n" // e f g h
|
||||
"lea (%0,%3,2),%0 \n" // src += stride * 2
|
||||
"vinserti128 $1,(%0),%%ymm2,%%ymm2 \n" // i j k l
|
||||
"vinserti128 $1,(%0,%3),%%ymm3,%%ymm3 \n" // m n o p
|
||||
"lea (%0,%3,2),%0 \n" // src += stride * 2
|
||||
|
||||
|
||||
// Transpose 2x2
|
||||
"vpunpckldq %%ymm1,%%ymm0,%%ymm4 \n" // a e b f from row 0, 1
|
||||
"vpunpckldq %%ymm3,%%ymm2,%%ymm5 \n" // i m j n from row 2, 3
|
||||
"vpunpckhdq %%ymm1,%%ymm0,%%ymm6 \n" // c g d h from row 0, 1
|
||||
"vpunpckhdq %%ymm3,%%ymm2,%%ymm7 \n" // k o l p from row 2, 3
|
||||
|
||||
|
||||
// Transpose 4x4
|
||||
"vpunpcklqdq %%ymm5,%%ymm4,%%ymm0 \n" // a e i m -> output row 0
|
||||
"vpunpckhqdq %%ymm5,%%ymm4,%%ymm1 \n" // b f j n -> output row 1
|
||||
"vpunpcklqdq %%ymm7,%%ymm6,%%ymm2 \n" // c g k o -> output row 2
|
||||
"vpunpckhqdq %%ymm7,%%ymm6,%%ymm3 \n" // d h l p -> output row 3
|
||||
|
||||
|
||||
// Store the four output rows. dst is first bumped by stride + 32 so the
|
||||
// remaining rows can be addressed with a -32 displacement; the final
|
||||
// "sub %4,%1" leaves dst exactly 32 bytes past where it started.
|
||||
"vmovdqu %%ymm0,(%1) \n"
|
||||
"lea 32(%1,%4),%1 \n" // dst += stride + 32
|
||||
"vmovdqu %%ymm1,-32(%1) \n"
|
||||
"vmovdqu %%ymm2,-32(%1,%4) \n"
|
||||
"vmovdqu %%ymm3,-32(%1,%4,2) \n"
|
||||
"sub %4,%1 \n"
|
||||
"sub $0x8,%2 \n"
|
||||
"jg 1b \n"
|
||||
"vzeroupper \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+rm"(width) // %2
|
||||
: "r"((ptrdiff_t)(src_stride)), // %3
|
||||
"r"((ptrdiff_t)(dst_stride)) // %4
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
}
|
||||
#endif // defined(HAS_TRANSPOSE4X4_32_AVX2)
|
||||
|
||||
#endif // defined(__x86_64__) || defined(__i386__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
233
3rdparty/libyuv/source/rotate_lsx.cc
vendored
Normal file
233
3rdparty/libyuv/source/rotate_lsx.cc
vendored
Normal file
@@ -0,0 +1,233 @@
|
||||
/*
|
||||
* Copyright 2022 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2022 Loongson Technology Corporation Limited
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/rotate_row.h"
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
|
||||
#include "libyuv/loongson_intrinsics.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Interleave helpers, one per element width (B = byte, H = halfword,
// W = word, D = doubleword). Each macro runs the "interleave low"
// (__lsx_vilvl_*) and "interleave high" (__lsx_vilvh_*) intrinsic on the two
// input pairs through DUP2_ARG2:
//   out0 / out2 receive the vilvl results for (in1, in0) and (in3, in2),
//   out1 / out3 receive the vilvh results for the same pairs.
// NOTE(review): this assumes DUP2_ARG2(ins, a, b, c, d, o0, o1) performs
// o0 = ins(a, b); o1 = ins(c, d); - confirm in loongson_intrinsics.h.
//
// The bodies are wrapped in do { ... } while (0) so that "MACRO(...);" is a
// single statement and stays valid inside an unbraced if / else.
#define ILVLH_B(in0, in1, in2, in3, out0, out1, out2, out3)   \
  do {                                                        \
    DUP2_ARG2(__lsx_vilvl_b, in1, in0, in3, in2, out0, out2); \
    DUP2_ARG2(__lsx_vilvh_b, in1, in0, in3, in2, out1, out3); \
  } while (0)

#define ILVLH_H(in0, in1, in2, in3, out0, out1, out2, out3)   \
  do {                                                        \
    DUP2_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, out0, out2); \
    DUP2_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, out1, out3); \
  } while (0)

#define ILVLH_W(in0, in1, in2, in3, out0, out1, out2, out3)   \
  do {                                                        \
    DUP2_ARG2(__lsx_vilvl_w, in1, in0, in3, in2, out0, out2); \
    DUP2_ARG2(__lsx_vilvh_w, in1, in0, in3, in2, out1, out3); \
  } while (0)

#define ILVLH_D(in0, in1, in2, in3, out0, out1, out2, out3)   \
  do {                                                        \
    DUP2_ARG2(__lsx_vilvl_d, in1, in0, in3, in2, out0, out2); \
    DUP2_ARG2(__lsx_vilvh_d, in1, in0, in3, in2, out1, out3); \
  } while (0)
|
||||
|
||||
// Store helpers. LSX_ST_4 writes four vectors to _dst, _dst + _stride,
// _dst + _stride2 and _dst + _stride3, then advances _dst by _stride4.
// LSX_ST_2 writes two vectors to _dst and _dst + _stride, then advances _dst
// by _stride2. _dst must be a modifiable pointer lvalue because the macro
// updates it in place.
//
// The bodies are wrapped in do { ... } while (0) so that "MACRO(...);" is a
// single statement and stays valid inside an unbraced if / else.
#define LSX_ST_4(_dst0, _dst1, _dst2, _dst3, _dst, _stride, _stride2, \
                 _stride3, _stride4)                                  \
  do {                                                                \
    __lsx_vst(_dst0, _dst, 0);                                        \
    __lsx_vstx(_dst1, _dst, _stride);                                 \
    __lsx_vstx(_dst2, _dst, _stride2);                                \
    __lsx_vstx(_dst3, _dst, _stride3);                                \
    _dst += _stride4;                                                 \
  } while (0)

#define LSX_ST_2(_dst0, _dst1, _dst, _stride, _stride2) \
  do {                                                  \
    __lsx_vst(_dst0, _dst, 0);                          \
    __lsx_vstx(_dst1, _dst, _stride);                   \
    _dst += _stride2;                                   \
  } while (0)
|
||||
|
||||
// 16-row UV transpose built from the 8-row kernel: the upper 8 source rows are
// transposed first, then the lower 8 rows, whose output is written 8 bytes
// further along every destination row of both planes.
void TransposeUVWx16_C(const uint8_t* src,
                       int src_stride,
                       uint8_t* dst_a,
                       int dst_stride_a,
                       uint8_t* dst_b,
                       int dst_stride_b,
                       int width) {
  const uint8_t* lower_src = src + 8 * src_stride;
  uint8_t* lower_dst_a = dst_a + 8;
  uint8_t* lower_dst_b = dst_b + 8;

  // Source rows 0..7.
  TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
                   width);
  // Source rows 8..15.
  TransposeUVWx8_C(lower_src, src_stride, lower_dst_a, dst_stride_a,
                   lower_dst_b, dst_stride_b, width);
}
|
||||
|
||||
void TransposeWx16_LSX(const uint8_t* src,
|
||||
int src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_stride,
|
||||
int width) {
|
||||
int x;
|
||||
int len = width / 16;
|
||||
uint8_t* s;
|
||||
int src_stride2 = src_stride << 1;
|
||||
int src_stride3 = src_stride + src_stride2;
|
||||
int src_stride4 = src_stride2 << 1;
|
||||
int dst_stride2 = dst_stride << 1;
|
||||
int dst_stride3 = dst_stride + dst_stride2;
|
||||
int dst_stride4 = dst_stride2 << 1;
|
||||
__m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
|
||||
__m128i tmp0, tmp1, tmp2, tmp3;
|
||||
__m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
|
||||
__m128i res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
|
||||
|
||||
for (x = 0; x < len; x++) {
|
||||
s = (uint8_t*)src;
|
||||
src0 = __lsx_vld(s, 0);
|
||||
src1 = __lsx_vldx(s, src_stride);
|
||||
src2 = __lsx_vldx(s, src_stride2);
|
||||
src3 = __lsx_vldx(s, src_stride3);
|
||||
s += src_stride4;
|
||||
ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
|
||||
ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);
|
||||
src0 = __lsx_vld(s, 0);
|
||||
src1 = __lsx_vldx(s, src_stride);
|
||||
src2 = __lsx_vldx(s, src_stride2);
|
||||
src3 = __lsx_vldx(s, src_stride3);
|
||||
s += src_stride4;
|
||||
ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
|
||||
ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
|
||||
ILVLH_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
|
||||
ILVLH_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
|
||||
src0 = __lsx_vld(s, 0);
|
||||
src1 = __lsx_vldx(s, src_stride);
|
||||
src2 = __lsx_vldx(s, src_stride2);
|
||||
src3 = __lsx_vldx(s, src_stride3);
|
||||
s += src_stride4;
|
||||
ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
|
||||
ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);
|
||||
src0 = __lsx_vld(s, 0);
|
||||
src1 = __lsx_vldx(s, src_stride);
|
||||
src2 = __lsx_vldx(s, src_stride2);
|
||||
src3 = __lsx_vldx(s, src_stride3);
|
||||
s += src_stride4;
|
||||
ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
|
||||
ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
|
||||
res8 = __lsx_vilvl_w(reg4, reg0);
|
||||
res9 = __lsx_vilvh_w(reg4, reg0);
|
||||
ILVLH_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
|
||||
LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
|
||||
dst_stride4);
|
||||
res8 = __lsx_vilvl_w(reg5, reg1);
|
||||
res9 = __lsx_vilvh_w(reg5, reg1);
|
||||
ILVLH_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
|
||||
LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
|
||||
dst_stride4);
|
||||
res8 = __lsx_vilvl_w(reg6, reg2);
|
||||
res9 = __lsx_vilvh_w(reg6, reg2);
|
||||
ILVLH_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
|
||||
LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
|
||||
dst_stride4);
|
||||
res8 = __lsx_vilvl_w(reg7, reg3);
|
||||
res9 = __lsx_vilvh_w(reg7, reg3);
|
||||
ILVLH_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
|
||||
LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
|
||||
dst_stride4);
|
||||
src += 16;
|
||||
}
|
||||
}
|
||||
|
||||
void TransposeUVWx16_LSX(const uint8_t* src,
|
||||
int src_stride,
|
||||
uint8_t* dst_a,
|
||||
int dst_stride_a,
|
||||
uint8_t* dst_b,
|
||||
int dst_stride_b,
|
||||
int width) {
|
||||
int x;
|
||||
int len = width / 8;
|
||||
uint8_t* s;
|
||||
int src_stride2 = src_stride << 1;
|
||||
int src_stride3 = src_stride + src_stride2;
|
||||
int src_stride4 = src_stride2 << 1;
|
||||
int dst_stride_a2 = dst_stride_a << 1;
|
||||
int dst_stride_b2 = dst_stride_b << 1;
|
||||
__m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
|
||||
__m128i tmp0, tmp1, tmp2, tmp3;
|
||||
__m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
|
||||
__m128i res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
|
||||
|
||||
for (x = 0; x < len; x++) {
|
||||
s = (uint8_t*)src;
|
||||
src0 = __lsx_vld(s, 0);
|
||||
src1 = __lsx_vldx(s, src_stride);
|
||||
src2 = __lsx_vldx(s, src_stride2);
|
||||
src3 = __lsx_vldx(s, src_stride3);
|
||||
s += src_stride4;
|
||||
ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
|
||||
ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);
|
||||
src0 = __lsx_vld(s, 0);
|
||||
src1 = __lsx_vldx(s, src_stride);
|
||||
src2 = __lsx_vldx(s, src_stride2);
|
||||
src3 = __lsx_vldx(s, src_stride3);
|
||||
s += src_stride4;
|
||||
ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
|
||||
ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
|
||||
ILVLH_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
|
||||
ILVLH_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
|
||||
src0 = __lsx_vld(s, 0);
|
||||
src1 = __lsx_vldx(s, src_stride);
|
||||
src2 = __lsx_vldx(s, src_stride2);
|
||||
src3 = __lsx_vldx(s, src_stride3);
|
||||
s += src_stride4;
|
||||
ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
|
||||
ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);
|
||||
src0 = __lsx_vld(s, 0);
|
||||
src1 = __lsx_vldx(s, src_stride);
|
||||
src2 = __lsx_vldx(s, src_stride2);
|
||||
src3 = __lsx_vldx(s, src_stride3);
|
||||
s += src_stride4;
|
||||
ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
|
||||
ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
|
||||
res8 = __lsx_vilvl_w(reg4, reg0);
|
||||
res9 = __lsx_vilvh_w(reg4, reg0);
|
||||
ILVLH_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
|
||||
LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);
|
||||
LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);
|
||||
res8 = __lsx_vilvl_w(reg5, reg1);
|
||||
res9 = __lsx_vilvh_w(reg5, reg1);
|
||||
ILVLH_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
|
||||
LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);
|
||||
LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);
|
||||
res8 = __lsx_vilvl_w(reg6, reg2);
|
||||
res9 = __lsx_vilvh_w(reg6, reg2);
|
||||
ILVLH_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
|
||||
LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);
|
||||
LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);
|
||||
res8 = __lsx_vilvl_w(reg7, reg3);
|
||||
res9 = __lsx_vilvh_w(reg7, reg3);
|
||||
ILVLH_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
|
||||
LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);
|
||||
LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);
|
||||
src += 16;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
|
||||
219
3rdparty/libyuv/source/rotate_neon.cc
vendored
Normal file
219
3rdparty/libyuv/source/rotate_neon.cc
vendored
Normal file
@@ -0,0 +1,219 @@
|
||||
/*
|
||||
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/rotate_row.h"
|
||||
#include "libyuv/row.h"
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
|
||||
!defined(__aarch64__)
|
||||
|
||||
// Transposes a strip that is 8 source rows high, 8 source columns at a time
// (32-bit ARM NEON). Each loop iteration:
//   - loads 8 bytes from each of 8 consecutive source rows into d0-d7,
//   - transposes the 8x8 byte block with vtrn.8 / vtrn.16 / vtrn.32 followed
//     by vrev16.8,
//   - stores 8 destination rows of 8 bytes each,
//   - advances src by 8 bytes and dst by 8 rows.
// temp is a scratch pointer used to walk the rows for both loads and stores.
void TransposeWx8_NEON(const uint8_t* src,
|
||||
int src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_stride,
|
||||
int width) {
|
||||
const uint8_t* temp;
|
||||
asm volatile(
|
||||
// Loops are on blocks of 8. The counter is biased by -8 up front so that
|
||||
// the "subs ... bge" pair below keeps looping while at least 8 columns
|
||||
// were left before the subtraction; the loop stops once it goes below 0.
|
||||
"sub %[width], #8 \n"
|
||||
|
||||
|
||||
"1: \n"
|
||||
"mov %[temp], %[src] \n"
|
||||
"vld1.8 {d0}, [%[temp]], %[src_stride] \n"
|
||||
"vld1.8 {d1}, [%[temp]], %[src_stride] \n"
|
||||
"vld1.8 {d2}, [%[temp]], %[src_stride] \n"
|
||||
"vld1.8 {d3}, [%[temp]], %[src_stride] \n"
|
||||
"vld1.8 {d4}, [%[temp]], %[src_stride] \n"
|
||||
"vld1.8 {d5}, [%[temp]], %[src_stride] \n"
|
||||
"vld1.8 {d6}, [%[temp]], %[src_stride] \n"
|
||||
"vld1.8 {d7}, [%[temp]] \n"
|
||||
"add %[src], #8 \n"
|
||||
|
||||
|
||||
"vtrn.8 d1, d0 \n"
|
||||
"vtrn.8 d3, d2 \n"
|
||||
"vtrn.8 d5, d4 \n"
|
||||
"vtrn.8 d7, d6 \n"
|
||||
// Sets the flags tested by the bge at the end of the loop; none of the
|
||||
// NEON or address instructions in between modify them.
|
||||
"subs %[width], #8 \n"
|
||||
|
||||
|
||||
"vtrn.16 d1, d3 \n"
|
||||
"vtrn.16 d0, d2 \n"
|
||||
"vtrn.16 d5, d7 \n"
|
||||
"vtrn.16 d4, d6 \n"
|
||||
|
||||
|
||||
"vtrn.32 d1, d5 \n"
|
||||
"vtrn.32 d0, d4 \n"
|
||||
"vtrn.32 d3, d7 \n"
|
||||
"vtrn.32 d2, d6 \n"
|
||||
|
||||
|
||||
"vrev16.8 q0, q0 \n"
|
||||
"vrev16.8 q1, q1 \n"
|
||||
"vrev16.8 q2, q2 \n"
|
||||
"vrev16.8 q3, q3 \n"
|
||||
|
||||
|
||||
"mov %[temp], %[dst] \n"
|
||||
"vst1.8 {d1}, [%[temp]], %[dst_stride] \n"
|
||||
"vst1.8 {d0}, [%[temp]], %[dst_stride] \n"
|
||||
"vst1.8 {d3}, [%[temp]], %[dst_stride] \n"
|
||||
"vst1.8 {d2}, [%[temp]], %[dst_stride] \n"
|
||||
"vst1.8 {d5}, [%[temp]], %[dst_stride] \n"
|
||||
"vst1.8 {d4}, [%[temp]], %[dst_stride] \n"
|
||||
"vst1.8 {d7}, [%[temp]], %[dst_stride] \n"
|
||||
"vst1.8 {d6}, [%[temp]] \n"
|
||||
// dst += 8 * dst_stride
|
||||
"add %[dst], %[dst], %[dst_stride], lsl #3 \n"
|
||||
|
||||
|
||||
"bge 1b \n"
|
||||
: [temp] "=&r"(temp), // %[temp]
|
||||
[src] "+r"(src), // %[src]
|
||||
[dst] "+r"(dst), // %[dst]
|
||||
[width] "+r"(width) // %[width]
|
||||
: [src_stride] "r"(src_stride), // %[src_stride]
|
||||
[dst_stride] "r"(dst_stride) // %[dst_stride]
|
||||
: "memory", "cc", "q0", "q1", "q2", "q3");
|
||||
}
|
||||
|
||||
// Transposes a strip of byte pairs that is 8 source rows high and splits the
// result into two planes (32-bit ARM NEON). Each loop iteration:
//   - loads 16 bytes (8 two-byte pairs) from each of 8 consecutive source
//     rows with vld2.8, which de-interleaves each row into an even-byte and
//     an odd-byte D register,
//   - transposes with vtrn.8 / vtrn.16 / vtrn.32 followed by vrev16.8,
//   - stores 8 rows of 8 bytes to dst_a (even D registers) and 8 rows of
//     8 bytes to dst_b (odd D registers),
//   - advances src by 16 bytes and both destinations by 8 rows.
// temp is a scratch pointer used to walk the rows for both loads and stores.
void TransposeUVWx8_NEON(const uint8_t* src,
|
||||
int src_stride,
|
||||
uint8_t* dst_a,
|
||||
int dst_stride_a,
|
||||
uint8_t* dst_b,
|
||||
int dst_stride_b,
|
||||
int width) {
|
||||
const uint8_t* temp;
|
||||
asm volatile(
|
||||
// Loops are on blocks of 8. The counter is biased by -8 up front so that
|
||||
// the "subs ... bge" pair below keeps looping while at least 8 pairs
|
||||
// were left before the subtraction; the loop stops once it goes below 0.
|
||||
"sub %[width], #8 \n"
|
||||
|
||||
|
||||
"1: \n"
|
||||
"mov %[temp], %[src] \n"
|
||||
"vld2.8 {d0, d1}, [%[temp]], %[src_stride] \n"
|
||||
"vld2.8 {d2, d3}, [%[temp]], %[src_stride] \n"
|
||||
"vld2.8 {d4, d5}, [%[temp]], %[src_stride] \n"
|
||||
"vld2.8 {d6, d7}, [%[temp]], %[src_stride] \n"
|
||||
"vld2.8 {d16, d17}, [%[temp]], %[src_stride] \n"
|
||||
"vld2.8 {d18, d19}, [%[temp]], %[src_stride] \n"
|
||||
"vld2.8 {d20, d21}, [%[temp]], %[src_stride] \n"
|
||||
"vld2.8 {d22, d23}, [%[temp]] \n"
|
||||
"add %[src], #8*2 \n"
|
||||
|
||||
|
||||
"vtrn.8 q1, q0 \n"
|
||||
"vtrn.8 q3, q2 \n"
|
||||
"vtrn.8 q9, q8 \n"
|
||||
"vtrn.8 q11, q10 \n"
|
||||
// Sets the flags tested by the bge at the end of the loop; none of the
|
||||
// NEON or address instructions in between modify them.
|
||||
"subs %[width], #8 \n"
|
||||
|
||||
|
||||
"vtrn.16 q1, q3 \n"
|
||||
"vtrn.16 q0, q2 \n"
|
||||
"vtrn.16 q9, q11 \n"
|
||||
"vtrn.16 q8, q10 \n"
|
||||
|
||||
|
||||
"vtrn.32 q1, q9 \n"
|
||||
"vtrn.32 q0, q8 \n"
|
||||
"vtrn.32 q3, q11 \n"
|
||||
"vtrn.32 q2, q10 \n"
|
||||
|
||||
|
||||
"vrev16.8 q0, q0 \n"
|
||||
"vrev16.8 q1, q1 \n"
|
||||
"vrev16.8 q2, q2 \n"
|
||||
"vrev16.8 q3, q3 \n"
|
||||
"vrev16.8 q8, q8 \n"
|
||||
"vrev16.8 q9, q9 \n"
|
||||
"vrev16.8 q10, q10 \n"
|
||||
"vrev16.8 q11, q11 \n"
|
||||
|
||||
|
||||
// First plane: low halves (even D registers) of each Q register.
|
||||
"mov %[temp], %[dst_a] \n"
|
||||
"vst1.8 {d2}, [%[temp]], %[dst_stride_a] \n"
|
||||
"vst1.8 {d0}, [%[temp]], %[dst_stride_a] \n"
|
||||
"vst1.8 {d6}, [%[temp]], %[dst_stride_a] \n"
|
||||
"vst1.8 {d4}, [%[temp]], %[dst_stride_a] \n"
|
||||
"vst1.8 {d18}, [%[temp]], %[dst_stride_a] \n"
|
||||
"vst1.8 {d16}, [%[temp]], %[dst_stride_a] \n"
|
||||
"vst1.8 {d22}, [%[temp]], %[dst_stride_a] \n"
|
||||
"vst1.8 {d20}, [%[temp]] \n"
|
||||
// dst_a += 8 * dst_stride_a
|
||||
"add %[dst_a], %[dst_a], %[dst_stride_a], lsl #3 \n"
|
||||
|
||||
|
||||
// Second plane: high halves (odd D registers) of each Q register.
|
||||
"mov %[temp], %[dst_b] \n"
|
||||
"vst1.8 {d3}, [%[temp]], %[dst_stride_b] \n"
|
||||
"vst1.8 {d1}, [%[temp]], %[dst_stride_b] \n"
|
||||
"vst1.8 {d7}, [%[temp]], %[dst_stride_b] \n"
|
||||
"vst1.8 {d5}, [%[temp]], %[dst_stride_b] \n"
|
||||
"vst1.8 {d19}, [%[temp]], %[dst_stride_b] \n"
|
||||
"vst1.8 {d17}, [%[temp]], %[dst_stride_b] \n"
|
||||
"vst1.8 {d23}, [%[temp]], %[dst_stride_b] \n"
|
||||
"vst1.8 {d21}, [%[temp]] \n"
|
||||
// dst_b += 8 * dst_stride_b
|
||||
"add %[dst_b], %[dst_b], %[dst_stride_b], lsl #3 \n"
|
||||
|
||||
|
||||
"bge 1b \n"
|
||||
: [temp] "=&r"(temp), // %[temp]
|
||||
[src] "+r"(src), // %[src]
|
||||
[dst_a] "+r"(dst_a), // %[dst_a]
|
||||
[dst_b] "+r"(dst_b), // %[dst_b]
|
||||
[width] "+r"(width) // %[width]
|
||||
: [src_stride] "r"(src_stride), // %[src_stride]
|
||||
[dst_stride_a] "r"(dst_stride_a), // %[dst_stride_a]
|
||||
[dst_stride_b] "r"(dst_stride_b) // %[dst_stride_b]
|
||||
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
|
||||
}
|
||||
|
||||
// Transpose 32 bit values (ARGB)
|
||||
// Transposes 4x4 blocks of 32-bit values (32-bit ARM NEON).
// src..src3 point at four consecutive source rows and dst..dst3 at four
// consecutive destination rows. Each loop iteration:
//   - loads four 32-bit values from each source row, scattering them across
//     one lane of q0-q3 so that qN ends up holding element N of all four
//     rows, then moves every source pointer down by 4 rows,
//   - stores q0-q3 as 16-byte rows to dst..dst3 and moves every destination
//     pointer 16 bytes to the right.
// width is decremented by 4 per iteration and the loop repeats while the
// result is > 0, so the body always runs at least once.
void Transpose4x4_32_NEON(const uint8_t* src,
|
||||
int src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_stride,
|
||||
int width) {
|
||||
const uint8_t* src1 = src + src_stride;
|
||||
const uint8_t* src2 = src1 + src_stride;
|
||||
const uint8_t* src3 = src2 + src_stride;
|
||||
uint8_t* dst1 = dst + dst_stride;
|
||||
uint8_t* dst2 = dst1 + dst_stride;
|
||||
uint8_t* dst3 = dst2 + dst_stride;
|
||||
asm volatile(
|
||||
// Main loop transpose 4x4. Read a column, write a row.
|
||||
"1: \n"
|
||||
// %9 is src_stride * 4, so each post-increment steps down 4 source rows.
|
||||
"vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%0], %9 \n"
|
||||
"vld4.32 {d0[1], d2[1], d4[1], d6[1]}, [%1], %9 \n"
|
||||
"vld4.32 {d1[0], d3[0], d5[0], d7[0]}, [%2], %9 \n"
|
||||
"vld4.32 {d1[1], d3[1], d5[1], d7[1]}, [%3], %9 \n"
|
||||
"subs %8, %8, #4 \n" // w -= 4
|
||||
"vst1.8 {q0}, [%4]! \n"
|
||||
"vst1.8 {q1}, [%5]! \n"
|
||||
"vst1.8 {q2}, [%6]! \n"
|
||||
"vst1.8 {q3}, [%7]! \n"
|
||||
"bgt 1b \n"
|
||||
|
||||
|
||||
: "+r"(src), // %0
|
||||
"+r"(src1), // %1
|
||||
"+r"(src2), // %2
|
||||
"+r"(src3), // %3
|
||||
"+r"(dst), // %4
|
||||
"+r"(dst1), // %5
|
||||
"+r"(dst2), // %6
|
||||
"+r"(dst3), // %7
|
||||
"+r"(width) // %8
|
||||
: "r"((ptrdiff_t)(src_stride * 4)) // %9
|
||||
: "memory", "cc", "q0", "q1", "q2", "q3");
|
||||
}
|
||||
|
||||
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
273
3rdparty/libyuv/source/rotate_neon64.cc
vendored
Normal file
273
3rdparty/libyuv/source/rotate_neon64.cc
vendored
Normal file
@@ -0,0 +1,273 @@
|
||||
/*
|
||||
* Copyright 2014 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/rotate_row.h"
|
||||
#include "libyuv/row.h"
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// This module is for GCC Neon armv8 64 bit.
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
|
||||
void TransposeWx16_NEON(const uint8_t* src,
|
||||
int src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_stride,
|
||||
int width) {
|
||||
const uint8_t* src_temp;
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"mov %[src_temp], %[src] \n"
|
||||
|
||||
"ld1 {v16.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v17.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v18.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v19.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v20.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v21.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v22.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v23.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v24.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v25.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v26.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v27.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v28.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v29.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v30.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v31.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
|
||||
"add %[src], %[src], #16 \n"
|
||||
|
||||
// Transpose bytes within each 2x2 block.
|
||||
"trn1 v0.16b, v16.16b, v17.16b \n"
|
||||
"trn2 v1.16b, v16.16b, v17.16b \n"
|
||||
"trn1 v2.16b, v18.16b, v19.16b \n"
|
||||
"trn2 v3.16b, v18.16b, v19.16b \n"
|
||||
"trn1 v4.16b, v20.16b, v21.16b \n"
|
||||
"trn2 v5.16b, v20.16b, v21.16b \n"
|
||||
"trn1 v6.16b, v22.16b, v23.16b \n"
|
||||
"trn2 v7.16b, v22.16b, v23.16b \n"
|
||||
"trn1 v8.16b, v24.16b, v25.16b \n"
|
||||
"trn2 v9.16b, v24.16b, v25.16b \n"
|
||||
"trn1 v10.16b, v26.16b, v27.16b \n"
|
||||
"trn2 v11.16b, v26.16b, v27.16b \n"
|
||||
"trn1 v12.16b, v28.16b, v29.16b \n"
|
||||
"trn2 v13.16b, v28.16b, v29.16b \n"
|
||||
"trn1 v14.16b, v30.16b, v31.16b \n"
|
||||
"trn2 v15.16b, v30.16b, v31.16b \n"
|
||||
|
||||
// Transpose 2x2-byte blocks within each 4x4 block.
|
||||
"trn1 v16.8h, v0.8h, v2.8h \n"
|
||||
"trn1 v17.8h, v1.8h, v3.8h \n"
|
||||
"trn2 v18.8h, v0.8h, v2.8h \n"
|
||||
"trn2 v19.8h, v1.8h, v3.8h \n"
|
||||
"trn1 v20.8h, v4.8h, v6.8h \n"
|
||||
"trn1 v21.8h, v5.8h, v7.8h \n"
|
||||
"trn2 v22.8h, v4.8h, v6.8h \n"
|
||||
"trn2 v23.8h, v5.8h, v7.8h \n"
|
||||
"trn1 v24.8h, v8.8h, v10.8h \n"
|
||||
"trn1 v25.8h, v9.8h, v11.8h \n"
|
||||
"trn2 v26.8h, v8.8h, v10.8h \n"
|
||||
"trn2 v27.8h, v9.8h, v11.8h \n"
|
||||
"trn1 v28.8h, v12.8h, v14.8h \n"
|
||||
"trn1 v29.8h, v13.8h, v15.8h \n"
|
||||
"trn2 v30.8h, v12.8h, v14.8h \n"
|
||||
"trn2 v31.8h, v13.8h, v15.8h \n"
|
||||
|
||||
"subs %w[width], %w[width], #16 \n"
|
||||
|
||||
// Transpose 4x4-byte blocks within each 8x8 block.
|
||||
"trn1 v0.4s, v16.4s, v20.4s \n"
|
||||
"trn1 v2.4s, v17.4s, v21.4s \n"
|
||||
"trn1 v4.4s, v18.4s, v22.4s \n"
|
||||
"trn1 v6.4s, v19.4s, v23.4s \n"
|
||||
"trn2 v8.4s, v16.4s, v20.4s \n"
|
||||
"trn2 v10.4s, v17.4s, v21.4s \n"
|
||||
"trn2 v12.4s, v18.4s, v22.4s \n"
|
||||
"trn2 v14.4s, v19.4s, v23.4s \n"
|
||||
"trn1 v1.4s, v24.4s, v28.4s \n"
|
||||
"trn1 v3.4s, v25.4s, v29.4s \n"
|
||||
"trn1 v5.4s, v26.4s, v30.4s \n"
|
||||
"trn1 v7.4s, v27.4s, v31.4s \n"
|
||||
"trn2 v9.4s, v24.4s, v28.4s \n"
|
||||
"trn2 v11.4s, v25.4s, v29.4s \n"
|
||||
"trn2 v13.4s, v26.4s, v30.4s \n"
|
||||
"trn2 v15.4s, v27.4s, v31.4s \n"
|
||||
|
||||
// Transpose 8x8-byte blocks and store.
|
||||
"st2 {v0.d, v1.d}[0], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v2.d, v3.d}[0], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v4.d, v5.d}[0], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v6.d, v7.d}[0], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v8.d, v9.d}[0], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v10.d, v11.d}[0], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v12.d, v13.d}[0], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v14.d, v15.d}[0], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v0.d, v1.d}[1], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v2.d, v3.d}[1], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v4.d, v5.d}[1], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v6.d, v7.d}[1], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v8.d, v9.d}[1], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v10.d, v11.d}[1], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v12.d, v13.d}[1], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v14.d, v15.d}[1], [%[dst]], %[dst_stride] \n"
|
||||
|
||||
"b.gt 1b \n"
|
||||
: [src] "+r"(src), // %[src]
|
||||
[src_temp] "=&r"(src_temp), // %[src_temp]
|
||||
[dst] "+r"(dst), // %[dst]
|
||||
[width] "+r"(width) // %[width]
|
||||
: [src_stride] "r"((ptrdiff_t)src_stride), // %[src_stride]
|
||||
[dst_stride] "r"((ptrdiff_t)dst_stride) // %[dst_stride]
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
|
||||
"v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
|
||||
"v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28",
|
||||
"v29", "v30", "v31");
|
||||
}
|
||||
|
||||
void TransposeUVWx8_NEON(const uint8_t* src,
|
||||
int src_stride,
|
||||
uint8_t* dst_a,
|
||||
int dst_stride_a,
|
||||
uint8_t* dst_b,
|
||||
int dst_stride_b,
|
||||
int width) {
|
||||
const uint8_t* temp;
|
||||
asm volatile(
|
||||
// loops are on blocks of 8. loop will stop when
|
||||
// counter gets to or below 0. starting the counter
|
||||
// at w-8 allow for this
|
||||
"sub %w[width], %w[width], #8 \n"
|
||||
|
||||
"1: \n"
|
||||
"mov %[temp], %[src] \n"
|
||||
"ld1 {v0.16b}, [%[temp]], %[src_stride] \n"
|
||||
"ld1 {v1.16b}, [%[temp]], %[src_stride] \n"
|
||||
"ld1 {v2.16b}, [%[temp]], %[src_stride] \n"
|
||||
"ld1 {v3.16b}, [%[temp]], %[src_stride] \n"
|
||||
"ld1 {v4.16b}, [%[temp]], %[src_stride] \n"
|
||||
"ld1 {v5.16b}, [%[temp]], %[src_stride] \n"
|
||||
"ld1 {v6.16b}, [%[temp]], %[src_stride] \n"
|
||||
"ld1 {v7.16b}, [%[temp]] \n"
|
||||
"add %[src], %[src], #16 \n"
|
||||
|
||||
"trn1 v16.16b, v0.16b, v1.16b \n"
|
||||
"trn2 v17.16b, v0.16b, v1.16b \n"
|
||||
"trn1 v18.16b, v2.16b, v3.16b \n"
|
||||
"trn2 v19.16b, v2.16b, v3.16b \n"
|
||||
"trn1 v20.16b, v4.16b, v5.16b \n"
|
||||
"trn2 v21.16b, v4.16b, v5.16b \n"
|
||||
"trn1 v22.16b, v6.16b, v7.16b \n"
|
||||
"trn2 v23.16b, v6.16b, v7.16b \n"
|
||||
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
|
||||
"trn1 v0.8h, v16.8h, v18.8h \n"
|
||||
"trn2 v1.8h, v16.8h, v18.8h \n"
|
||||
"trn1 v2.8h, v20.8h, v22.8h \n"
|
||||
"trn2 v3.8h, v20.8h, v22.8h \n"
|
||||
"trn1 v4.8h, v17.8h, v19.8h \n"
|
||||
"trn2 v5.8h, v17.8h, v19.8h \n"
|
||||
"trn1 v6.8h, v21.8h, v23.8h \n"
|
||||
"trn2 v7.8h, v21.8h, v23.8h \n"
|
||||
|
||||
"trn1 v16.4s, v0.4s, v2.4s \n"
|
||||
"trn2 v17.4s, v0.4s, v2.4s \n"
|
||||
"trn1 v18.4s, v1.4s, v3.4s \n"
|
||||
"trn2 v19.4s, v1.4s, v3.4s \n"
|
||||
"trn1 v20.4s, v4.4s, v6.4s \n"
|
||||
"trn2 v21.4s, v4.4s, v6.4s \n"
|
||||
"trn1 v22.4s, v5.4s, v7.4s \n"
|
||||
"trn2 v23.4s, v5.4s, v7.4s \n"
|
||||
|
||||
"mov %[temp], %[dst_a] \n"
|
||||
"st1 {v16.d}[0], [%[temp]], %[dst_stride_a] \n"
|
||||
"st1 {v18.d}[0], [%[temp]], %[dst_stride_a] \n"
|
||||
"st1 {v17.d}[0], [%[temp]], %[dst_stride_a] \n"
|
||||
"st1 {v19.d}[0], [%[temp]], %[dst_stride_a] \n"
|
||||
"st1 {v16.d}[1], [%[temp]], %[dst_stride_a] \n"
|
||||
"st1 {v18.d}[1], [%[temp]], %[dst_stride_a] \n"
|
||||
"st1 {v17.d}[1], [%[temp]], %[dst_stride_a] \n"
|
||||
"st1 {v19.d}[1], [%[temp]] \n"
|
||||
"add %[dst_a], %[dst_a], %[dst_stride_a], lsl #3 \n"
|
||||
|
||||
"mov %[temp], %[dst_b] \n"
|
||||
"st1 {v20.d}[0], [%[temp]], %[dst_stride_b] \n"
|
||||
"st1 {v22.d}[0], [%[temp]], %[dst_stride_b] \n"
|
||||
"st1 {v21.d}[0], [%[temp]], %[dst_stride_b] \n"
|
||||
"st1 {v23.d}[0], [%[temp]], %[dst_stride_b] \n"
|
||||
"st1 {v20.d}[1], [%[temp]], %[dst_stride_b] \n"
|
||||
"st1 {v22.d}[1], [%[temp]], %[dst_stride_b] \n"
|
||||
"st1 {v21.d}[1], [%[temp]], %[dst_stride_b] \n"
|
||||
"st1 {v23.d}[1], [%[temp]] \n"
|
||||
"add %[dst_b], %[dst_b], %[dst_stride_b], lsl #3 \n"
|
||||
|
||||
"b.ge 1b \n"
|
||||
: [temp] "=&r"(temp), // %[temp]
|
||||
[src] "+r"(src), // %[src]
|
||||
[dst_a] "+r"(dst_a), // %[dst_a]
|
||||
[dst_b] "+r"(dst_b), // %[dst_b]
|
||||
[width] "+r"(width) // %[width]
|
||||
: [src_stride] "r"((ptrdiff_t)src_stride), // %[src_stride]
|
||||
[dst_stride_a] "r"((ptrdiff_t)dst_stride_a), // %[dst_stride_a]
|
||||
[dst_stride_b] "r"((ptrdiff_t)dst_stride_b) // %[dst_stride_b]
|
||||
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
|
||||
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31");
|
||||
}
|
||||
|
||||
// Transpose 32 bit values (ARGB)
|
||||
void Transpose4x4_32_NEON(const uint8_t* src,
|
||||
int src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_stride,
|
||||
int width) {
|
||||
const uint8_t* src1 = src + src_stride;
|
||||
const uint8_t* src2 = src1 + src_stride;
|
||||
const uint8_t* src3 = src2 + src_stride;
|
||||
uint8_t* dst1 = dst + dst_stride;
|
||||
uint8_t* dst2 = dst1 + dst_stride;
|
||||
uint8_t* dst3 = dst2 + dst_stride;
|
||||
asm volatile(
|
||||
// Main loop transpose 4x4. Read a column, write a row.
|
||||
"1: \n"
|
||||
"ld4 {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n"
|
||||
"ld4 {v0.s, v1.s, v2.s, v3.s}[1], [%1], %9 \n"
|
||||
"ld4 {v0.s, v1.s, v2.s, v3.s}[2], [%2], %9 \n"
|
||||
"ld4 {v0.s, v1.s, v2.s, v3.s}[3], [%3], %9 \n"
|
||||
"subs %w8, %w8, #4 \n" // w -= 4
|
||||
"st1 {v0.4s}, [%4], 16 \n"
|
||||
"st1 {v1.4s}, [%5], 16 \n"
|
||||
"st1 {v2.4s}, [%6], 16 \n"
|
||||
"st1 {v3.4s}, [%7], 16 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(src1), // %1
|
||||
"+r"(src2), // %2
|
||||
"+r"(src3), // %3
|
||||
"+r"(dst), // %4
|
||||
"+r"(dst1), // %5
|
||||
"+r"(dst2), // %6
|
||||
"+r"(dst3), // %7
|
||||
"+r"(width) // %8
|
||||
: "r"((ptrdiff_t)(src_stride * 4)) // %9
|
||||
: "memory", "cc", "v0", "v1", "v2", "v3");
|
||||
}
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
174
3rdparty/libyuv/source/rotate_sme.cc
vendored
Normal file
174
3rdparty/libyuv/source/rotate_sme.cc
vendored
Normal file
@@ -0,0 +1,174 @@
|
||||
/*
|
||||
* Copyright 2024 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/rotate_row.h"
|
||||
#include "libyuv/row.h"
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \
|
||||
defined(__aarch64__)
|
||||
|
||||
// Transposes a width x height byte plane using the ZA0 byte tile.
// The plane is walked in tiles of at most vl x vl bytes, where vl is the
// value reported by cntb: rows are loaded as horizontal tile slices and
// written back out as vertical slices, with predicates covering partial
// tiles at the right and bottom edges.
// Returns immediately when width or height is not positive.
__arm_locally_streaming __arm_new("za") void TransposeWxH_SME(
    const uint8_t* src,
    int src_stride,
    uint8_t* dst,
    int dst_stride,
    int width,
    int height) {
  // Both the do/while loops and the "b.ne" asm loops below execute at least
  // once and only terminate on an exact count match, so an empty plane has
  // to be rejected before entering them.
  if (width <= 0 || height <= 0) {
    return;
  }

  int vl;
  asm("cntb %x0" : "=r"(vl));

  do {
    const uint8_t* src2 = src;
    uint8_t* dst2 = dst;

    // Number of source rows handled by this band of tiles (at most vl).
    int block_height = height > vl ? vl : height;

    int width2 = width;
    do {
      const uint8_t* src3 = src2;

      // Number of source columns handled by this tile (at most vl).
      int block_width = width2 > vl ? vl : width2;

      asm volatile(
          "mov w12, #0 \n"

          // Create a predicate to handle loading partial rows.
          "whilelt p0.b, wzr, %w[block_width] \n"

          // Load H <= VL rows into ZA0.
          "1: \n"
          "ld1b {za0h.b[w12, 0]}, p0/z, [%[src3]] \n"
          "add %[src3], %[src3], %[src_stride] \n"
          "add w12, w12, #1 \n"
          "cmp w12, %w[block_height] \n"
          "b.ne 1b \n"

          // Create a predicate to handle storing partial columns.
          "whilelt p0.b, wzr, %w[block_height] \n"
          "mov w12, #0 \n"

          // Store W <= VL columns from ZA0.
          "2: \n"
          "st1b {za0v.b[w12, 0]}, p0, [%[dst2]] \n"
          "add %[dst2], %[dst2], %[dst_stride] \n"
          "add w12, w12, #1 \n"
          "cmp w12, %w[block_width] \n"
          "b.ne 2b \n"
          : [src3] "+r"(src3),                        // %[src3]
            [dst2] "+r"(dst2)                         // %[dst2]
          : [src_stride] "r"((ptrdiff_t)src_stride),  // %[src_stride]
            [dst_stride] "r"((ptrdiff_t)dst_stride),  // %[dst_stride]
            [block_width] "r"(block_width),           // %[block_width]
            [block_height] "r"(block_height)          // %[block_height]
          : "cc", "memory", "p0", "w12", "za");

      // dst2 was advanced by the store loop; only the source column and the
      // remaining width need stepping here.
      src2 += vl;
      width2 -= vl;
    } while (width2 > 0);

    // Step down one band of rows. The offset is computed in ptrdiff_t so a
    // large stride cannot overflow int.
    src += (ptrdiff_t)vl * src_stride;
    dst += vl;
    height -= vl;
  } while (height > 0);
}
|
||||
|
||||
// Transposes an interleaved two-channel (UV) plane into two separate planes
// using the ZA0 byte tile. width counts byte pairs per row; height counts
// rows. Rows are loaded as horizontal tile slices so the two channels land
// in alternating tile columns; even columns are then stored to dst_a and odd
// columns to dst_b.
// Returns immediately when width or height is not positive.
__arm_locally_streaming __arm_new("za") void TransposeUVWxH_SME(
    const uint8_t* src,
    int src_stride,
    uint8_t* dst_a,
    int dst_stride_a,
    uint8_t* dst_b,
    int dst_stride_b,
    int width,
    int height) {
  // Both the do/while loops and the "b.ne" asm loops below execute at least
  // once and only terminate on an exact count match, so an empty plane has
  // to be rejected before entering them.
  if (width <= 0 || height <= 0) {
    return;
  }

  int vl;
  asm("cnth %x0" : "=r"(vl));

  do {
    const uint8_t* src2 = src;
    uint8_t* dst2_a = dst_a;
    uint8_t* dst2_b = dst_b;

    // Number of source rows handled by this band (at most 2 * vl, where vl
    // is the 16-bit element count reported by cnth).
    int block_height = height > vl * 2 ? vl * 2 : height;

    int width2 = width;
    do {
      const uint8_t* src3 = src2;

      // Number of byte pairs handled by this tile (at most vl).
      int block_width = width2 > vl ? vl : width2;

      asm volatile(
          "mov w12, #0 \n"

          // Create a predicate to handle loading partial rows,
          // %[block_width] is always a multiple of two here.
          "whilelt p0.b, wzr, %w[block_width] \n"

          // Load H <= VL rows into ZA0, such that U/V components exist in
          // alternating columns.
          "1: \n"
          "ld1b {za0h.b[w12, 0]}, p0/z, [%[src]] \n"
          "add %[src], %[src], %[src_stride] \n"
          "add w12, w12, #1 \n"
          "cmp w12, %w[block_height] \n"
          "b.ne 1b \n"

          // Create a predicate to handle storing partial columns.
          "whilelt p0.b, wzr, %w[block_height] \n"
          "mov w12, #0 \n"

          // Store alternating UV data from pairs of ZA0 columns.
          "2: \n"
          "st1b {za0v.b[w12, 0]}, p0, [%[dst_a]] \n"
          "st1b {za0v.b[w12, 1]}, p0, [%[dst_b]] \n"
          "add %[dst_a], %[dst_a], %[dst_stride_a] \n"
          "add %[dst_b], %[dst_b], %[dst_stride_b] \n"
          "add w12, w12, #2 \n"
          "cmp w12, %w[block_width] \n"
          "b.ne 2b \n"
          : [src] "+r"(src3),                             // %[src]
            [dst_a] "+r"(dst2_a),                         // %[dst_a]
            [dst_b] "+r"(dst2_b)                          // %[dst_b]
          : [src_stride] "r"((ptrdiff_t)src_stride),      // %[src_stride]
            [dst_stride_a] "r"((ptrdiff_t)dst_stride_a),  // %[dst_stride_a]
            [dst_stride_b] "r"((ptrdiff_t)dst_stride_b),  // %[dst_stride_b]
            [block_width] "r"(block_width * 2),           // %[block_width]
            [block_height] "r"(block_height)              // %[block_height]
          : "cc", "memory", "p0", "w12", "za");

      // dst2_a / dst2_b were advanced by the store loop; only the source
      // column and the remaining width need stepping here.
      src2 += 2 * vl;
      width2 -= vl;
    } while (width2 > 0);

    // Step down one band of rows. The offset is computed in ptrdiff_t so a
    // large stride cannot overflow int.
    src += (ptrdiff_t)2 * vl * src_stride;
    dst_a += 2 * vl;
    dst_b += 2 * vl;
    height -= 2 * vl;
  } while (height > 0);
}
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) &&
|
||||
// defined(__aarch64__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
253
3rdparty/libyuv/source/rotate_win.cc
vendored
Normal file
253
3rdparty/libyuv/source/rotate_win.cc
vendored
Normal file
@@ -0,0 +1,253 @@
|
||||
/*
|
||||
* Copyright 2013 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/rotate_row.h"
|
||||
#include "libyuv/row.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// This module is for 32 bit Visual C x86
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && defined(_M_IX86) && \
|
||||
(!defined(__clang__) || defined(LIBYUV_ENABLE_ROWWIN))
|
||||
|
||||
__declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src,
|
||||
int src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_stride,
|
||||
int width) {
|
||||
__asm {
|
||||
push edi
|
||||
push esi
|
||||
push ebp
|
||||
mov eax, [esp + 12 + 4] // src
|
||||
mov edi, [esp + 12 + 8] // src_stride
|
||||
mov edx, [esp + 12 + 12] // dst
|
||||
mov esi, [esp + 12 + 16] // dst_stride
|
||||
mov ecx, [esp + 12 + 20] // width
|
||||
|
||||
// Read in the data from the source pointer.
|
||||
// First round of bit swap.
|
||||
align 4
|
||||
convertloop:
|
||||
movq xmm0, qword ptr [eax]
|
||||
lea ebp, [eax + 8]
|
||||
movq xmm1, qword ptr [eax + edi]
|
||||
lea eax, [eax + 2 * edi]
|
||||
punpcklbw xmm0, xmm1
|
||||
movq xmm2, qword ptr [eax]
|
||||
movdqa xmm1, xmm0
|
||||
palignr xmm1, xmm1, 8
|
||||
movq xmm3, qword ptr [eax + edi]
|
||||
lea eax, [eax + 2 * edi]
|
||||
punpcklbw xmm2, xmm3
|
||||
movdqa xmm3, xmm2
|
||||
movq xmm4, qword ptr [eax]
|
||||
palignr xmm3, xmm3, 8
|
||||
movq xmm5, qword ptr [eax + edi]
|
||||
punpcklbw xmm4, xmm5
|
||||
lea eax, [eax + 2 * edi]
|
||||
movdqa xmm5, xmm4
|
||||
movq xmm6, qword ptr [eax]
|
||||
palignr xmm5, xmm5, 8
|
||||
movq xmm7, qword ptr [eax + edi]
|
||||
punpcklbw xmm6, xmm7
|
||||
mov eax, ebp
|
||||
movdqa xmm7, xmm6
|
||||
palignr xmm7, xmm7, 8
|
||||
// Second round of bit swap.
|
||||
punpcklwd xmm0, xmm2
|
||||
punpcklwd xmm1, xmm3
|
||||
movdqa xmm2, xmm0
|
||||
movdqa xmm3, xmm1
|
||||
palignr xmm2, xmm2, 8
|
||||
palignr xmm3, xmm3, 8
|
||||
punpcklwd xmm4, xmm6
|
||||
punpcklwd xmm5, xmm7
|
||||
movdqa xmm6, xmm4
|
||||
movdqa xmm7, xmm5
|
||||
palignr xmm6, xmm6, 8
|
||||
palignr xmm7, xmm7, 8
|
||||
// Third round of bit swap.
|
||||
// Write to the destination pointer.
|
||||
punpckldq xmm0, xmm4
|
||||
movq qword ptr [edx], xmm0
|
||||
movdqa xmm4, xmm0
|
||||
palignr xmm4, xmm4, 8
|
||||
movq qword ptr [edx + esi], xmm4
|
||||
lea edx, [edx + 2 * esi]
|
||||
punpckldq xmm2, xmm6
|
||||
movdqa xmm6, xmm2
|
||||
palignr xmm6, xmm6, 8
|
||||
movq qword ptr [edx], xmm2
|
||||
punpckldq xmm1, xmm5
|
||||
movq qword ptr [edx + esi], xmm6
|
||||
lea edx, [edx + 2 * esi]
|
||||
movdqa xmm5, xmm1
|
||||
movq qword ptr [edx], xmm1
|
||||
palignr xmm5, xmm5, 8
|
||||
punpckldq xmm3, xmm7
|
||||
movq qword ptr [edx + esi], xmm5
|
||||
lea edx, [edx + 2 * esi]
|
||||
movq qword ptr [edx], xmm3
|
||||
movdqa xmm7, xmm3
|
||||
palignr xmm7, xmm7, 8
|
||||
sub ecx, 8
|
||||
movq qword ptr [edx + esi], xmm7
|
||||
lea edx, [edx + 2 * esi]
|
||||
jg convertloop
|
||||
|
||||
pop ebp
|
||||
pop esi
|
||||
pop edi
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
__declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src,
|
||||
int src_stride,
|
||||
uint8_t* dst_a,
|
||||
int dst_stride_a,
|
||||
uint8_t* dst_b,
|
||||
int dst_stride_b,
|
||||
int w) {
|
||||
__asm {
|
||||
push ebx
|
||||
push esi
|
||||
push edi
|
||||
push ebp
|
||||
mov eax, [esp + 16 + 4] // src
|
||||
mov edi, [esp + 16 + 8] // src_stride
|
||||
mov edx, [esp + 16 + 12] // dst_a
|
||||
mov esi, [esp + 16 + 16] // dst_stride_a
|
||||
mov ebx, [esp + 16 + 20] // dst_b
|
||||
mov ebp, [esp + 16 + 24] // dst_stride_b
|
||||
mov ecx, esp
|
||||
sub esp, 4 + 16
|
||||
and esp, ~15
|
||||
mov [esp + 16], ecx
|
||||
mov ecx, [ecx + 16 + 28] // w
|
||||
|
||||
align 4
|
||||
// Read in the data from the source pointer.
|
||||
// First round of bit swap.
|
||||
convertloop:
|
||||
movdqu xmm0, [eax]
|
||||
movdqu xmm1, [eax + edi]
|
||||
lea eax, [eax + 2 * edi]
|
||||
movdqa xmm7, xmm0 // use xmm7 as temp register.
|
||||
punpcklbw xmm0, xmm1
|
||||
punpckhbw xmm7, xmm1
|
||||
movdqa xmm1, xmm7
|
||||
movdqu xmm2, [eax]
|
||||
movdqu xmm3, [eax + edi]
|
||||
lea eax, [eax + 2 * edi]
|
||||
movdqa xmm7, xmm2
|
||||
punpcklbw xmm2, xmm3
|
||||
punpckhbw xmm7, xmm3
|
||||
movdqa xmm3, xmm7
|
||||
movdqu xmm4, [eax]
|
||||
movdqu xmm5, [eax + edi]
|
||||
lea eax, [eax + 2 * edi]
|
||||
movdqa xmm7, xmm4
|
||||
punpcklbw xmm4, xmm5
|
||||
punpckhbw xmm7, xmm5
|
||||
movdqa xmm5, xmm7
|
||||
movdqu xmm6, [eax]
|
||||
movdqu xmm7, [eax + edi]
|
||||
lea eax, [eax + 2 * edi]
|
||||
movdqu [esp], xmm5 // backup xmm5
|
||||
neg edi
|
||||
movdqa xmm5, xmm6 // use xmm5 as temp register.
|
||||
punpcklbw xmm6, xmm7
|
||||
punpckhbw xmm5, xmm7
|
||||
movdqa xmm7, xmm5
|
||||
lea eax, [eax + 8 * edi + 16]
|
||||
neg edi
|
||||
// Second round of bit swap.
|
||||
movdqa xmm5, xmm0
|
||||
punpcklwd xmm0, xmm2
|
||||
punpckhwd xmm5, xmm2
|
||||
movdqa xmm2, xmm5
|
||||
movdqa xmm5, xmm1
|
||||
punpcklwd xmm1, xmm3
|
||||
punpckhwd xmm5, xmm3
|
||||
movdqa xmm3, xmm5
|
||||
movdqa xmm5, xmm4
|
||||
punpcklwd xmm4, xmm6
|
||||
punpckhwd xmm5, xmm6
|
||||
movdqa xmm6, xmm5
|
||||
movdqu xmm5, [esp] // restore xmm5
|
||||
movdqu [esp], xmm6 // backup xmm6
|
||||
movdqa xmm6, xmm5 // use xmm6 as temp register.
|
||||
punpcklwd xmm5, xmm7
|
||||
punpckhwd xmm6, xmm7
|
||||
movdqa xmm7, xmm6
|
||||
|
||||
// Third round of bit swap.
|
||||
// Write to the destination pointer.
|
||||
movdqa xmm6, xmm0
|
||||
punpckldq xmm0, xmm4
|
||||
punpckhdq xmm6, xmm4
|
||||
movdqa xmm4, xmm6
|
||||
movdqu xmm6, [esp] // restore xmm6
|
||||
movlpd qword ptr [edx], xmm0
|
||||
movhpd qword ptr [ebx], xmm0
|
||||
movlpd qword ptr [edx + esi], xmm4
|
||||
lea edx, [edx + 2 * esi]
|
||||
movhpd qword ptr [ebx + ebp], xmm4
|
||||
lea ebx, [ebx + 2 * ebp]
|
||||
movdqa xmm0, xmm2 // use xmm0 as the temp register.
|
||||
punpckldq xmm2, xmm6
|
||||
movlpd qword ptr [edx], xmm2
|
||||
movhpd qword ptr [ebx], xmm2
|
||||
punpckhdq xmm0, xmm6
|
||||
movlpd qword ptr [edx + esi], xmm0
|
||||
lea edx, [edx + 2 * esi]
|
||||
movhpd qword ptr [ebx + ebp], xmm0
|
||||
lea ebx, [ebx + 2 * ebp]
|
||||
movdqa xmm0, xmm1 // use xmm0 as the temp register.
|
||||
punpckldq xmm1, xmm5
|
||||
movlpd qword ptr [edx], xmm1
|
||||
movhpd qword ptr [ebx], xmm1
|
||||
punpckhdq xmm0, xmm5
|
||||
movlpd qword ptr [edx + esi], xmm0
|
||||
lea edx, [edx + 2 * esi]
|
||||
movhpd qword ptr [ebx + ebp], xmm0
|
||||
lea ebx, [ebx + 2 * ebp]
|
||||
movdqa xmm0, xmm3 // use xmm0 as the temp register.
|
||||
punpckldq xmm3, xmm7
|
||||
movlpd qword ptr [edx], xmm3
|
||||
movhpd qword ptr [ebx], xmm3
|
||||
punpckhdq xmm0, xmm7
|
||||
sub ecx, 8
|
||||
movlpd qword ptr [edx + esi], xmm0
|
||||
lea edx, [edx + 2 * esi]
|
||||
movhpd qword ptr [ebx + ebp], xmm0
|
||||
lea ebx, [ebx + 2 * ebp]
|
||||
jg convertloop
|
||||
|
||||
mov esp, [esp + 16]
|
||||
pop ebp
|
||||
pop edi
|
||||
pop esi
|
||||
pop ebx
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
2659
3rdparty/libyuv/source/row_any.cc
vendored
Normal file
2659
3rdparty/libyuv/source/row_any.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
4668
3rdparty/libyuv/source/row_common.cc
vendored
Normal file
4668
3rdparty/libyuv/source/row_common.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
9742
3rdparty/libyuv/source/row_gcc.cc
vendored
Normal file
9742
3rdparty/libyuv/source/row_gcc.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
2343
3rdparty/libyuv/source/row_lasx.cc
vendored
Normal file
2343
3rdparty/libyuv/source/row_lasx.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
3030
3rdparty/libyuv/source/row_lsx.cc
vendored
Normal file
3030
3rdparty/libyuv/source/row_lsx.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
3983
3rdparty/libyuv/source/row_neon.cc
vendored
Normal file
3983
3rdparty/libyuv/source/row_neon.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
5617
3rdparty/libyuv/source/row_neon64.cc
vendored
Normal file
5617
3rdparty/libyuv/source/row_neon64.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1670
3rdparty/libyuv/source/row_rvv.cc
vendored
Normal file
1670
3rdparty/libyuv/source/row_rvv.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1183
3rdparty/libyuv/source/row_sme.cc
vendored
Normal file
1183
3rdparty/libyuv/source/row_sme.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1088
3rdparty/libyuv/source/row_sve.cc
vendored
Normal file
1088
3rdparty/libyuv/source/row_sve.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
221
3rdparty/libyuv/source/row_win.cc
vendored
Normal file
221
3rdparty/libyuv/source/row_win.cc
vendored
Normal file
@@ -0,0 +1,221 @@
|
||||
/*
|
||||
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/row.h"
|
||||
#include "libyuv/convert_from_argb.h" // For ArgbConstants
|
||||
|
||||
// This module is for Visual C 32/64 bit
// The guard below enables the module when x86 is not disabled, an x86/x64
// architecture macro is defined, and either the compiler is MSVC (not clang)
// or LIBYUV_ENABLE_ROWWIN is defined.
// NOTE(review): `_M_X86` does not appear among MSVC's documented predefined
// macros (32-bit x86 is `_M_IX86`); confirm whether 32-bit MSVC builds are
// meant to be covered by this condition.
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(__x86_64__) || defined(__i386__) || \
|
||||
defined(_M_X64) || defined(_M_X86)) && \
|
||||
((defined(_MSC_VER) && !defined(__clang__)) || \
|
||||
defined(LIBYUV_ENABLE_ROWWIN))
|
||||
|
||||
#include <emmintrin.h>
|
||||
#include <tmmintrin.h> // For _mm_maddubs_epi16
|
||||
#include <immintrin.h> // For AVX2 intrinsics
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Read 8 UV from 444.
// Loads 8 bytes from u_buf and 8 bytes from u_buf + offset and interleaves
// them into xmm3; loads 8 bytes from y_buf into xmm4 with every byte
// duplicated into both halves of a 16-bit lane. Advances u_buf and y_buf by
// 8. xmm1 is left holding the bytes read from u_buf + offset.
#define READYUV444                                      \
  xmm4 = _mm_loadl_epi64((__m128i*)y_buf);              \
  xmm3 = _mm_loadl_epi64((__m128i*)u_buf);              \
  xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset));   \
  y_buf += 8;                                           \
  u_buf += 8;                                           \
  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                 \
  xmm3 = _mm_unpacklo_epi8(xmm3, xmm1);
|
||||
|
||||
// Read 8 UV from 444, with 8 alpha.
// Loads 8 bytes from u_buf and 8 bytes from u_buf + offset and interleaves
// them into xmm3; loads 8 bytes from y_buf into xmm4 with every byte
// duplicated into both halves of a 16-bit lane; loads 8 bytes from a_buf
// into xmm5. Advances u_buf, y_buf and a_buf by 8. xmm1 is left holding the
// bytes read from u_buf + offset.
#define READYUVA444                                     \
  xmm5 = _mm_loadl_epi64((__m128i*)a_buf);              \
  xmm4 = _mm_loadl_epi64((__m128i*)y_buf);              \
  xmm3 = _mm_loadl_epi64((__m128i*)u_buf);              \
  xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset));   \
  a_buf += 8;                                           \
  y_buf += 8;                                           \
  u_buf += 8;                                           \
  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                 \
  xmm3 = _mm_unpacklo_epi8(xmm3, xmm1);
|
||||
|
||||
// Read 4 UV from 422, upsample to 8 UV.
// Loads 4 bytes from u_buf and 4 bytes from u_buf + offset, interleaves them
// and then duplicates each resulting 16-bit pair, leaving 8 pairs in xmm3;
// loads 8 bytes from y_buf into xmm4 with every byte duplicated into both
// halves of a 16-bit lane. Advances u_buf by 4 and y_buf by 8. xmm1 is left
// holding the bytes read from u_buf + offset.
#define READYUV422                                          \
  xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                  \
  xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf);              \
  xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset));   \
  y_buf += 8;                                               \
  u_buf += 4;                                               \
  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                     \
  xmm3 = _mm_unpacklo_epi8(xmm3, xmm1);                     \
  xmm3 = _mm_unpacklo_epi16(xmm3, xmm3);
|
||||
|
||||
// Read 4 UV from 422, upsample to 8 UV, with 8 alpha.
// Loads 4 bytes from u_buf and 4 bytes from u_buf + offset, interleaves them
// and then duplicates each resulting 16-bit pair, leaving 8 pairs in xmm3;
// loads 8 bytes from y_buf into xmm4 with every byte duplicated into both
// halves of a 16-bit lane; loads 8 bytes from a_buf into xmm5. Advances
// u_buf by 4, y_buf by 8 and a_buf by 8. xmm1 is left holding the bytes read
// from u_buf + offset.
#define READYUVA422                                         \
  xmm5 = _mm_loadl_epi64((__m128i*)a_buf);                  \
  xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                  \
  xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf);              \
  xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset));   \
  a_buf += 8;                                               \
  y_buf += 8;                                               \
  u_buf += 4;                                               \
  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                     \
  xmm3 = _mm_unpacklo_epi8(xmm3, xmm1);                     \
  xmm3 = _mm_unpacklo_epi16(xmm3, xmm3);
|
||||
|
||||
// Convert 8 pixels: 8 UV and 8 Y.
// Inputs: xmm3 holds interleaved UV bytes and xmm4 holds the Y values as
// 16-bit lanes. yuvconstants points at an object with kYToRgb, kYBiasToRgb,
// kUVToB, kUVToG and kUVToR members, each dereferenced here as a __m128i.
// Steps: subtract 0x80 from every UV byte; scale Y with an unsigned
// high-half multiply by kYToRgb and add kYBiasToRgb; form the three chroma
// terms with a multiply-add against kUVToB / kUVToG / kUVToR; add (B, R) or
// subtract (G) them from Y with signed saturation; arithmetic shift right by
// 6; pack to unsigned bytes with saturation.
// Outputs: xmm0, xmm1 and xmm2 hold the B, G and R bytes respectively (per
// the constant names). xmm3 and xmm4 are overwritten.
// The macro argument is parenthesized so that an expression argument such as
// &constants binds correctly before "->".
#define YUVTORGB(yuvconstants)                                            \
  xmm3 = _mm_sub_epi8(xmm3, _mm_set1_epi8((char)0x80));                   \
  xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)(yuvconstants)->kYToRgb);       \
  xmm4 = _mm_add_epi16(xmm4, *(__m128i*)(yuvconstants)->kYBiasToRgb);     \
  xmm0 = _mm_maddubs_epi16(*(__m128i*)(yuvconstants)->kUVToB, xmm3);      \
  xmm1 = _mm_maddubs_epi16(*(__m128i*)(yuvconstants)->kUVToG, xmm3);      \
  xmm2 = _mm_maddubs_epi16(*(__m128i*)(yuvconstants)->kUVToR, xmm3);      \
  xmm0 = _mm_adds_epi16(xmm4, xmm0);                                      \
  xmm1 = _mm_subs_epi16(xmm4, xmm1);                                      \
  xmm2 = _mm_adds_epi16(xmm4, xmm2);                                      \
  xmm0 = _mm_srai_epi16(xmm0, 6);                                         \
  xmm1 = _mm_srai_epi16(xmm1, 6);                                         \
  xmm2 = _mm_srai_epi16(xmm2, 6);                                         \
  xmm0 = _mm_packus_epi16(xmm0, xmm0);                                    \
  xmm1 = _mm_packus_epi16(xmm1, xmm1);                                    \
  xmm2 = _mm_packus_epi16(xmm2, xmm2);
|
||||
|
||||
// Store 8 ARGB values.
// Inputs: the low 8 bytes of xmm0, xmm1, xmm2 and xmm5 hold the four
// per-pixel byte channels. The macro interleaves them so each pixel becomes
// the byte sequence (xmm0, xmm1, xmm2, xmm5), writes the resulting 32 bytes
// to dst_argb with two unaligned stores, and advances dst_argb by 32.
// On exit xmm0 holds pixels 0-3, xmm1 pixels 4-7, and xmm2 the interleaved
// xmm2/xmm5 bytes.
#define STOREARGB                                       \
  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                 \
  xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);                 \
  xmm1 = _mm_unpackhi_epi16(xmm0, xmm2);                \
  xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);                \
  _mm_storeu_si128((__m128i*)dst_argb, xmm0);           \
  _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1);    \
  dst_argb += 32;
|
||||
|
||||
#if defined(HAS_I422TOARGBROW_SSSE3)
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(HAS_I444TOARGBROW_SSSE3)
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(HAS_I444ALPHATOARGBROW_SSSE3)
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(HAS_ARGBTOYROW_AVX2)
|
||||
|
||||
#if defined(__clang__) || defined(__GNUC__)
|
||||
#define LIBYUV_TARGET_AVX2 __attribute__((target("avx2")))
|
||||
#else
|
||||
#define LIBYUV_TARGET_AVX2
|
||||
#endif
|
||||
|
||||
LIBYUV_TARGET_AVX2
|
||||
void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
__m256i ymm5 = _mm256_set1_epi8((char)0x80);
|
||||
__m128i kRGBToY = _mm_loadu_si128((const __m128i*)c->kRGBToY);
|
||||
__m256i ymm4 = _mm256_broadcastsi128_si256(kRGBToY);
|
||||
__m128i kAddY = _mm_loadu_si128((const __m128i*)c->kAddY);
|
||||
__m256i ymm7 = _mm256_broadcastsi128_si256(kAddY);
|
||||
__m256i ymm6 = _mm256_maddubs_epi16(ymm4, ymm5);
|
||||
ymm6 = _mm256_hadd_epi16(ymm6, ymm6);
|
||||
ymm7 = _mm256_sub_epi16(ymm7, ymm6);
|
||||
__m256i perm_mask = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
|
||||
|
||||
while (width > 0) {
|
||||
__m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_argb);
|
||||
__m256i ymm1 = _mm256_loadu_si256((const __m256i*)(src_argb + 32));
|
||||
__m256i ymm2 = _mm256_loadu_si256((const __m256i*)(src_argb + 64));
|
||||
__m256i ymm3 = _mm256_loadu_si256((const __m256i*)(src_argb + 96));
|
||||
src_argb += 128;
|
||||
|
||||
ymm0 = _mm256_sub_epi8(ymm0, ymm5);
|
||||
ymm1 = _mm256_sub_epi8(ymm1, ymm5);
|
||||
ymm2 = _mm256_sub_epi8(ymm2, ymm5);
|
||||
ymm3 = _mm256_sub_epi8(ymm3, ymm5);
|
||||
|
||||
ymm0 = _mm256_maddubs_epi16(ymm4, ymm0);
|
||||
ymm1 = _mm256_maddubs_epi16(ymm4, ymm1);
|
||||
ymm2 = _mm256_maddubs_epi16(ymm4, ymm2);
|
||||
ymm3 = _mm256_maddubs_epi16(ymm4, ymm3);
|
||||
|
||||
ymm0 = _mm256_hadd_epi16(ymm0, ymm1);
|
||||
ymm2 = _mm256_hadd_epi16(ymm2, ymm3);
|
||||
|
||||
ymm0 = _mm256_add_epi16(ymm0, ymm7);
|
||||
ymm2 = _mm256_add_epi16(ymm2, ymm7);
|
||||
|
||||
ymm0 = _mm256_srli_epi16(ymm0, 8);
|
||||
ymm2 = _mm256_srli_epi16(ymm2, 8);
|
||||
|
||||
ymm0 = _mm256_packus_epi16(ymm0, ymm2);
|
||||
ymm0 = _mm256_permutevar8x32_epi32(ymm0, perm_mask);
|
||||
|
||||
_mm256_storeu_si256((__m256i*)dst_y, ymm0);
|
||||
dst_y += 32;
|
||||
width -= 32;
|
||||
}
|
||||
}
|
||||
|
||||
// Forwards to ARGBToYMatrixRow_AVX2 with the kArgbI601Constants coefficient
// set; src_argb, dst_y and width are passed through unchanged.
LIBYUV_TARGET_AVX2
|
||||
void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
ARGBToYMatrixRow_AVX2(src_argb, dst_y, width, &kArgbI601Constants);
|
||||
}
|
||||
|
||||
// Forwards to ARGBToYMatrixRow_AVX2 with the kAbgrI601Constants coefficient
// set; src_abgr, dst_y and width are passed through unchanged.
LIBYUV_TARGET_AVX2
|
||||
void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
|
||||
ARGBToYMatrixRow_AVX2(src_abgr, dst_y, width, &kAbgrI601Constants);
|
||||
}
|
||||
|
||||
// Forwards to ARGBToYMatrixRow_AVX2 with the kArgbJPEGConstants coefficient
// set; src_argb, dst_y and width are passed through unchanged.
LIBYUV_TARGET_AVX2
|
||||
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
ARGBToYMatrixRow_AVX2(src_argb, dst_y, width, &kArgbJPEGConstants);
|
||||
}
|
||||
|
||||
// Forwards to ARGBToYMatrixRow_AVX2 with the kAbgrJPEGConstants coefficient
// set; src_abgr, dst_y and width are passed through unchanged.
LIBYUV_TARGET_AVX2
|
||||
void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
|
||||
ARGBToYMatrixRow_AVX2(src_abgr, dst_y, width, &kAbgrJPEGConstants);
|
||||
}
|
||||
|
||||
// Forwards to ARGBToYMatrixRow_AVX2 with the kRgbaJPEGConstants coefficient
// set; src_rgba, dst_y and width are passed through unchanged.
LIBYUV_TARGET_AVX2
|
||||
void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
|
||||
ARGBToYMatrixRow_AVX2(src_rgba, dst_y, width, &kRgbaJPEGConstants);
|
||||
}
|
||||
|
||||
// Forwards to ARGBToYMatrixRow_AVX2 with the kRgbaI601Constants coefficient
// set; src_rgba, dst_y and width are passed through unchanged.
LIBYUV_TARGET_AVX2
|
||||
void RGBAToYRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
|
||||
ARGBToYMatrixRow_AVX2(src_rgba, dst_y, width, &kRgbaI601Constants);
|
||||
}
|
||||
|
||||
// Forwards to ARGBToYMatrixRow_AVX2 with the kBgraI601Constants coefficient
// set; src_bgra, dst_y and width are passed through unchanged.
LIBYUV_TARGET_AVX2
|
||||
void BGRAToYRow_AVX2(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
|
||||
ARGBToYMatrixRow_AVX2(src_bgra, dst_y, width, &kBgraI601Constants);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_X86)) && ((defined(_MSC_VER) && !defined(__clang__)) || defined(LIBYUV_ENABLE_ROWWIN))
|
||||
2710
3rdparty/libyuv/source/scale.cc
vendored
Normal file
2710
3rdparty/libyuv/source/scale.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
991
3rdparty/libyuv/source/scale_any.cc
vendored
Normal file
991
3rdparty/libyuv/source/scale_any.cc
vendored
Normal file
@@ -0,0 +1,991 @@
|
||||
/*
|
||||
* Copyright 2015 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <string.h> // For memset/memcpy
|
||||
|
||||
#include "libyuv/scale.h"
|
||||
#include "libyuv/scale_row.h"
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Fixed scale down.
// Generates NAMEANY, which accepts any dst_width: the SIMD kernel handles the
// largest multiple of (MASK + 1) pixels and the C kernel handles the rest.
// MASK + 1 may be a non-power of 2 (e.g. 24 or 48), so the split uses MOD
// instead of a bit mask.
// FACTOR is deliberately NOT parenthesized in the expansion: callers pass
// "4 / 3" and "8 / 3", which must expand to (n * 4 / 3), not n * (4 / 3).
#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK)   \
  void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \
               int dst_width) {                                                \
    /* Pixels left over after the SIMD-sized portion. */                       \
    int tail = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */       \
    int body = dst_width - tail;                                               \
    if (body > 0) {                                                            \
      SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, body);                   \
    }                                                                          \
    /* The C kernel is always invoked, even when tail is 0. */                 \
    SCALEROWDOWN_C(src_ptr + (body * FACTOR) * BPP, src_stride,                \
                   dst_ptr + body * BPP, tail);                                \
  }
|
||||
|
||||
// Fixed scale down for odd source width.  Used by I420Blend subsampling.
// Since dst_width is (width + 1) / 2, the SIMD/C split is computed over
// dst_width - 1 pixels; the C kernel is then handed the leftover count plus
// one, so the final pixel always goes through the C (odd) kernel.
// As with SDANY, FACTOR is pasted without parentheses on purpose.
#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK)   \
  void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \
               int dst_width) {                                                \
    int full = dst_width - 1; /* every pixel except the last one */            \
    int tail = (int)((unsigned int)full % (MASK + 1)); /* NOLINT */            \
    int body = full - tail;                                                    \
    if (body > 0) {                                                            \
      SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, body);                   \
    }                                                                          \
    SCALEROWDOWN_C(src_ptr + (body * FACTOR) * BPP, src_stride,                \
                   dst_ptr + body * BPP, tail + 1);                            \
  }
|
||||
|
||||
#ifdef HAS_SCALEROWDOWN2_SSSE3
|
||||
SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
|
||||
SDANY(ScaleRowDown2Linear_Any_SSSE3,
|
||||
ScaleRowDown2Linear_SSSE3,
|
||||
ScaleRowDown2Linear_C,
|
||||
2,
|
||||
1,
|
||||
15)
|
||||
SDANY(ScaleRowDown2Box_Any_SSSE3,
|
||||
ScaleRowDown2Box_SSSE3,
|
||||
ScaleRowDown2Box_C,
|
||||
2,
|
||||
1,
|
||||
15)
|
||||
SDODD(ScaleRowDown2Box_Odd_SSSE3,
|
||||
ScaleRowDown2Box_SSSE3,
|
||||
ScaleRowDown2Box_Odd_C,
|
||||
2,
|
||||
1,
|
||||
15)
|
||||
#endif
|
||||
#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
|
||||
SDANY(ScaleUVRowDown2Box_Any_SSSE3,
|
||||
ScaleUVRowDown2Box_SSSE3,
|
||||
ScaleUVRowDown2Box_C,
|
||||
2,
|
||||
2,
|
||||
3)
|
||||
#endif
|
||||
#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
|
||||
SDANY(ScaleUVRowDown2Box_Any_AVX2,
|
||||
ScaleUVRowDown2Box_AVX2,
|
||||
ScaleUVRowDown2Box_C,
|
||||
2,
|
||||
2,
|
||||
7)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN2_AVX2
|
||||
SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
|
||||
SDANY(ScaleRowDown2Linear_Any_AVX2,
|
||||
ScaleRowDown2Linear_AVX2,
|
||||
ScaleRowDown2Linear_C,
|
||||
2,
|
||||
1,
|
||||
31)
|
||||
SDANY(ScaleRowDown2Box_Any_AVX2,
|
||||
ScaleRowDown2Box_AVX2,
|
||||
ScaleRowDown2Box_C,
|
||||
2,
|
||||
1,
|
||||
31)
|
||||
SDODD(ScaleRowDown2Box_Odd_AVX2,
|
||||
ScaleRowDown2Box_AVX2,
|
||||
ScaleRowDown2Box_Odd_C,
|
||||
2,
|
||||
1,
|
||||
31)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN2_NEON
|
||||
SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
|
||||
SDANY(ScaleRowDown2Linear_Any_NEON,
|
||||
ScaleRowDown2Linear_NEON,
|
||||
ScaleRowDown2Linear_C,
|
||||
2,
|
||||
1,
|
||||
15)
|
||||
SDANY(ScaleRowDown2Box_Any_NEON,
|
||||
ScaleRowDown2Box_NEON,
|
||||
ScaleRowDown2Box_C,
|
||||
2,
|
||||
1,
|
||||
15)
|
||||
SDODD(ScaleRowDown2Box_Odd_NEON,
|
||||
ScaleRowDown2Box_NEON,
|
||||
ScaleRowDown2Box_Odd_C,
|
||||
2,
|
||||
1,
|
||||
15)
|
||||
#endif
|
||||
#ifdef HAS_SCALEUVROWDOWN2_NEON
|
||||
SDANY(ScaleUVRowDown2_Any_NEON,
|
||||
ScaleUVRowDown2_NEON,
|
||||
ScaleUVRowDown2_C,
|
||||
2,
|
||||
2,
|
||||
7)
|
||||
#endif
|
||||
#ifdef HAS_SCALEUVROWDOWN2LINEAR_NEON
|
||||
SDANY(ScaleUVRowDown2Linear_Any_NEON,
|
||||
ScaleUVRowDown2Linear_NEON,
|
||||
ScaleUVRowDown2Linear_C,
|
||||
2,
|
||||
2,
|
||||
7)
|
||||
#endif
|
||||
#ifdef HAS_SCALEUVROWDOWN2BOX_NEON
|
||||
SDANY(ScaleUVRowDown2Box_Any_NEON,
|
||||
ScaleUVRowDown2Box_NEON,
|
||||
ScaleUVRowDown2Box_C,
|
||||
2,
|
||||
2,
|
||||
7)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN2_LSX
|
||||
SDANY(ScaleRowDown2_Any_LSX, ScaleRowDown2_LSX, ScaleRowDown2_C, 2, 1, 31)
|
||||
SDANY(ScaleRowDown2Linear_Any_LSX,
|
||||
ScaleRowDown2Linear_LSX,
|
||||
ScaleRowDown2Linear_C,
|
||||
2,
|
||||
1,
|
||||
31)
|
||||
SDANY(ScaleRowDown2Box_Any_LSX,
|
||||
ScaleRowDown2Box_LSX,
|
||||
ScaleRowDown2Box_C,
|
||||
2,
|
||||
1,
|
||||
31)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN4_SSSE3
|
||||
SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
|
||||
SDANY(ScaleRowDown4Box_Any_SSSE3,
|
||||
ScaleRowDown4Box_SSSE3,
|
||||
ScaleRowDown4Box_C,
|
||||
4,
|
||||
1,
|
||||
7)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN4_AVX2
|
||||
SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
|
||||
SDANY(ScaleRowDown4Box_Any_AVX2,
|
||||
ScaleRowDown4Box_AVX2,
|
||||
ScaleRowDown4Box_C,
|
||||
4,
|
||||
1,
|
||||
15)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN4_NEON
|
||||
SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 15)
|
||||
SDANY(ScaleRowDown4Box_Any_NEON,
|
||||
ScaleRowDown4Box_NEON,
|
||||
ScaleRowDown4Box_C,
|
||||
4,
|
||||
1,
|
||||
7)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN4_LSX
|
||||
SDANY(ScaleRowDown4_Any_LSX, ScaleRowDown4_LSX, ScaleRowDown4_C, 4, 1, 15)
|
||||
SDANY(ScaleRowDown4Box_Any_LSX,
|
||||
ScaleRowDown4Box_LSX,
|
||||
ScaleRowDown4Box_C,
|
||||
4,
|
||||
1,
|
||||
15)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN34_SSSE3
|
||||
SDANY(ScaleRowDown34_Any_SSSE3,
|
||||
ScaleRowDown34_SSSE3,
|
||||
ScaleRowDown34_C,
|
||||
4 / 3,
|
||||
1,
|
||||
23)
|
||||
SDANY(ScaleRowDown34_0_Box_Any_SSSE3,
|
||||
ScaleRowDown34_0_Box_SSSE3,
|
||||
ScaleRowDown34_0_Box_C,
|
||||
4 / 3,
|
||||
1,
|
||||
23)
|
||||
SDANY(ScaleRowDown34_1_Box_Any_SSSE3,
|
||||
ScaleRowDown34_1_Box_SSSE3,
|
||||
ScaleRowDown34_1_Box_C,
|
||||
4 / 3,
|
||||
1,
|
||||
23)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN34_NEON
|
||||
#ifdef __aarch64__
|
||||
SDANY(ScaleRowDown34_Any_NEON,
|
||||
ScaleRowDown34_NEON,
|
||||
ScaleRowDown34_C,
|
||||
4 / 3,
|
||||
1,
|
||||
47)
|
||||
SDANY(ScaleRowDown34_0_Box_Any_NEON,
|
||||
ScaleRowDown34_0_Box_NEON,
|
||||
ScaleRowDown34_0_Box_C,
|
||||
4 / 3,
|
||||
1,
|
||||
47)
|
||||
SDANY(ScaleRowDown34_1_Box_Any_NEON,
|
||||
ScaleRowDown34_1_Box_NEON,
|
||||
ScaleRowDown34_1_Box_C,
|
||||
4 / 3,
|
||||
1,
|
||||
47)
|
||||
#else
|
||||
SDANY(ScaleRowDown34_Any_NEON,
|
||||
ScaleRowDown34_NEON,
|
||||
ScaleRowDown34_C,
|
||||
4 / 3,
|
||||
1,
|
||||
23)
|
||||
SDANY(ScaleRowDown34_0_Box_Any_NEON,
|
||||
ScaleRowDown34_0_Box_NEON,
|
||||
ScaleRowDown34_0_Box_C,
|
||||
4 / 3,
|
||||
1,
|
||||
23)
|
||||
SDANY(ScaleRowDown34_1_Box_Any_NEON,
|
||||
ScaleRowDown34_1_Box_NEON,
|
||||
ScaleRowDown34_1_Box_C,
|
||||
4 / 3,
|
||||
1,
|
||||
23)
|
||||
#endif
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN34_LSX
|
||||
SDANY(ScaleRowDown34_Any_LSX,
|
||||
ScaleRowDown34_LSX,
|
||||
ScaleRowDown34_C,
|
||||
4 / 3,
|
||||
1,
|
||||
47)
|
||||
SDANY(ScaleRowDown34_0_Box_Any_LSX,
|
||||
ScaleRowDown34_0_Box_LSX,
|
||||
ScaleRowDown34_0_Box_C,
|
||||
4 / 3,
|
||||
1,
|
||||
47)
|
||||
SDANY(ScaleRowDown34_1_Box_Any_LSX,
|
||||
ScaleRowDown34_1_Box_LSX,
|
||||
ScaleRowDown34_1_Box_C,
|
||||
4 / 3,
|
||||
1,
|
||||
47)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN38_SSSE3
|
||||
SDANY(ScaleRowDown38_Any_SSSE3,
|
||||
ScaleRowDown38_SSSE3,
|
||||
ScaleRowDown38_C,
|
||||
8 / 3,
|
||||
1,
|
||||
11)
|
||||
SDANY(ScaleRowDown38_3_Box_Any_SSSE3,
|
||||
ScaleRowDown38_3_Box_SSSE3,
|
||||
ScaleRowDown38_3_Box_C,
|
||||
8 / 3,
|
||||
1,
|
||||
5)
|
||||
SDANY(ScaleRowDown38_2_Box_Any_SSSE3,
|
||||
ScaleRowDown38_2_Box_SSSE3,
|
||||
ScaleRowDown38_2_Box_C,
|
||||
8 / 3,
|
||||
1,
|
||||
5)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN38_NEON
|
||||
SDANY(ScaleRowDown38_Any_NEON,
|
||||
ScaleRowDown38_NEON,
|
||||
ScaleRowDown38_C,
|
||||
8 / 3,
|
||||
1,
|
||||
11)
|
||||
SDANY(ScaleRowDown38_3_Box_Any_NEON,
|
||||
ScaleRowDown38_3_Box_NEON,
|
||||
ScaleRowDown38_3_Box_C,
|
||||
8 / 3,
|
||||
1,
|
||||
11)
|
||||
SDANY(ScaleRowDown38_2_Box_Any_NEON,
|
||||
ScaleRowDown38_2_Box_NEON,
|
||||
ScaleRowDown38_2_Box_C,
|
||||
8 / 3,
|
||||
1,
|
||||
11)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN38_LSX
|
||||
SDANY(ScaleRowDown38_Any_LSX,
|
||||
ScaleRowDown38_LSX,
|
||||
ScaleRowDown38_C,
|
||||
8 / 3,
|
||||
1,
|
||||
11)
|
||||
SDANY(ScaleRowDown38_3_Box_Any_LSX,
|
||||
ScaleRowDown38_3_Box_LSX,
|
||||
ScaleRowDown38_3_Box_C,
|
||||
8 / 3,
|
||||
1,
|
||||
11)
|
||||
SDANY(ScaleRowDown38_2_Box_Any_LSX,
|
||||
ScaleRowDown38_2_Box_LSX,
|
||||
ScaleRowDown38_2_Box_C,
|
||||
8 / 3,
|
||||
1,
|
||||
11)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEARGBROWDOWN2_SSE2
|
||||
SDANY(ScaleARGBRowDown2_Any_SSE2,
|
||||
ScaleARGBRowDown2_SSE2,
|
||||
ScaleARGBRowDown2_C,
|
||||
2,
|
||||
4,
|
||||
3)
|
||||
SDANY(ScaleARGBRowDown2Linear_Any_SSE2,
|
||||
ScaleARGBRowDown2Linear_SSE2,
|
||||
ScaleARGBRowDown2Linear_C,
|
||||
2,
|
||||
4,
|
||||
3)
|
||||
SDANY(ScaleARGBRowDown2Box_Any_SSE2,
|
||||
ScaleARGBRowDown2Box_SSE2,
|
||||
ScaleARGBRowDown2Box_C,
|
||||
2,
|
||||
4,
|
||||
3)
|
||||
#endif
|
||||
#ifdef HAS_SCALEARGBROWDOWN2_NEON
|
||||
SDANY(ScaleARGBRowDown2_Any_NEON,
|
||||
ScaleARGBRowDown2_NEON,
|
||||
ScaleARGBRowDown2_C,
|
||||
2,
|
||||
4,
|
||||
7)
|
||||
SDANY(ScaleARGBRowDown2Linear_Any_NEON,
|
||||
ScaleARGBRowDown2Linear_NEON,
|
||||
ScaleARGBRowDown2Linear_C,
|
||||
2,
|
||||
4,
|
||||
7)
|
||||
SDANY(ScaleARGBRowDown2Box_Any_NEON,
|
||||
ScaleARGBRowDown2Box_NEON,
|
||||
ScaleARGBRowDown2Box_C,
|
||||
2,
|
||||
4,
|
||||
7)
|
||||
#endif
|
||||
#ifdef HAS_SCALEARGBROWDOWN2_LSX
|
||||
SDANY(ScaleARGBRowDown2_Any_LSX,
|
||||
ScaleARGBRowDown2_LSX,
|
||||
ScaleARGBRowDown2_C,
|
||||
2,
|
||||
4,
|
||||
3)
|
||||
SDANY(ScaleARGBRowDown2Linear_Any_LSX,
|
||||
ScaleARGBRowDown2Linear_LSX,
|
||||
ScaleARGBRowDown2Linear_C,
|
||||
2,
|
||||
4,
|
||||
3)
|
||||
SDANY(ScaleARGBRowDown2Box_Any_LSX,
|
||||
ScaleARGBRowDown2Box_LSX,
|
||||
ScaleARGBRowDown2Box_C,
|
||||
2,
|
||||
4,
|
||||
3)
|
||||
#endif
|
||||
#undef SDANY
|
||||
|
||||
// Scale down by even scale factor.
// Generates NAMEANY for kernels that take a per-pixel source step
// (src_stepx).  MASK + 1 is the SIMD step and is assumed to be a power of 2,
// because the split uses bit masks rather than MOD.
#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK)       \
  void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, \
               uint8_t* dst_ptr, int dst_width) {                           \
    int body = dst_width & ~MASK; /* pixels handled by the SIMD kernel */   \
    int tail = dst_width & MASK;  /* pixels left for the C kernel */        \
    if (body > 0) {                                                         \
      SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, body);     \
    }                                                                       \
    /* Source advances src_stepx pixels per destination pixel. */           \
    SCALEROWDOWN_C(src_ptr + (body * src_stepx) * BPP, src_stride,          \
                   src_stepx, dst_ptr + body * BPP, tail);                  \
  }
|
||||
|
||||
#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
|
||||
SDAANY(ScaleARGBRowDownEven_Any_SSE2,
|
||||
ScaleARGBRowDownEven_SSE2,
|
||||
ScaleARGBRowDownEven_C,
|
||||
4,
|
||||
3)
|
||||
SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2,
|
||||
ScaleARGBRowDownEvenBox_SSE2,
|
||||
ScaleARGBRowDownEvenBox_C,
|
||||
4,
|
||||
3)
|
||||
#endif
|
||||
#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
|
||||
SDAANY(ScaleARGBRowDownEven_Any_NEON,
|
||||
ScaleARGBRowDownEven_NEON,
|
||||
ScaleARGBRowDownEven_C,
|
||||
4,
|
||||
3)
|
||||
SDAANY(ScaleARGBRowDownEvenBox_Any_NEON,
|
||||
ScaleARGBRowDownEvenBox_NEON,
|
||||
ScaleARGBRowDownEvenBox_C,
|
||||
4,
|
||||
3)
|
||||
#endif
|
||||
#ifdef HAS_SCALEARGBROWDOWNEVEN_LSX
|
||||
SDAANY(ScaleARGBRowDownEven_Any_LSX,
|
||||
ScaleARGBRowDownEven_LSX,
|
||||
ScaleARGBRowDownEven_C,
|
||||
4,
|
||||
3)
|
||||
SDAANY(ScaleARGBRowDownEvenBox_Any_LSX,
|
||||
ScaleARGBRowDownEvenBox_LSX,
|
||||
ScaleARGBRowDownEvenBox_C,
|
||||
4,
|
||||
3)
|
||||
#endif
|
||||
#ifdef HAS_SCALEUVROWDOWNEVEN_NEON
|
||||
SDAANY(ScaleUVRowDownEven_Any_NEON,
|
||||
ScaleUVRowDownEven_NEON,
|
||||
ScaleUVRowDownEven_C,
|
||||
2,
|
||||
3)
|
||||
#endif
|
||||
|
||||
#ifdef SASIMDONLY
|
||||
// This also works and uses memcpy and SIMD instead of C, but is slower on ARM
|
||||
|
||||
// Add rows box filter scale down. Using macro from row_any
// SIMD-only variant: the remainder is processed by copying it into aligned
// scratch buffers, running the SIMD kernel on one full MASK + 1 step, and
// copying only the valid part of the result back.
//
// SBPP / BPP are BYTES per source / destination element.  dst_ptr is a
// uint16_t*, so the destination offset n * BPP must be applied to a byte
// pointer; adding it to dst_ptr directly would advance n * BPP uint16_t
// elements and touch the wrong (possibly out-of-bounds) pixels.
#define SAROW(NAMEANY, ANY_SIMD, SBPP, BPP, MASK)                        \
  void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int width) {   \
    SIMD_ALIGNED(uint16_t dst_temp[32]);                                 \
    SIMD_ALIGNED(uint8_t src_temp[32]);                                  \
    memset(dst_temp, 0, 32 * 2); /* for msan */                          \
    int r = width & MASK;                                                \
    int n = width & ~MASK;                                               \
    if (n > 0) {                                                         \
      ANY_SIMD(src_ptr, dst_ptr, n);                                     \
    }                                                                    \
    memcpy(src_temp, src_ptr + n * SBPP, r * SBPP);                      \
    /* Seed scratch with the current destination values for the tail. */ \
    memcpy(dst_temp, (const uint8_t*)dst_ptr + n * BPP, r * BPP);        \
    ANY_SIMD(src_temp, dst_temp, MASK + 1);                              \
    memcpy((uint8_t*)dst_ptr + n * BPP, dst_temp, r * BPP);              \
  }

#ifdef HAS_SCALEADDROW_SSE2
SAROW(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, 1, 2, 15)
#endif
#ifdef HAS_SCALEADDROW_AVX2
SAROW(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, 1, 2, 31)
#endif
#ifdef HAS_SCALEADDROW_NEON
SAROW(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, 1, 2, 15)
#endif
#ifdef HAS_SCALEADDROW_LSX
SAROW(ScaleAddRow_Any_LSX, ScaleAddRow_LSX, 1, 2, 15)
#endif
// Undefine the macro this branch actually defined (SAROW, not SAANY).
#undef SAROW
|
||||
|
||||
#else
|
||||
|
||||
// Add rows box filter scale down.
// Generates NAMEANY for any src_width: the SIMD kernel handles the multiple
// of (MASK + 1) and the C kernel is then called for the remaining
// src_width & MASK elements.  Source and destination advance by the same
// element count.
#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK)              \
  void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \
    int body = src_width & ~MASK;                                          \
    int tail = src_width & MASK;                                           \
    if (body > 0) {                                                        \
      SCALEADDROW_SIMD(src_ptr, dst_ptr, body);                            \
    }                                                                      \
    SCALEADDROW_C(src_ptr + body, dst_ptr + body, tail);                   \
  }
|
||||
|
||||
#ifdef HAS_SCALEADDROW_SSE2
|
||||
SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
|
||||
#endif
|
||||
#ifdef HAS_SCALEADDROW_AVX2
|
||||
SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
|
||||
#endif
|
||||
#ifdef HAS_SCALEADDROW_NEON
|
||||
SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
|
||||
#endif
|
||||
#ifdef HAS_SCALEADDROW_LSX
|
||||
SAANY(ScaleAddRow_Any_LSX, ScaleAddRow_LSX, ScaleAddRow_C, 15)
|
||||
#endif
|
||||
#undef SAANY
|
||||
|
||||
#endif // SASIMDONLY
|
||||
|
||||
// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
// Generates NAMEANY for any dst_width.  x is the starting source position and
// dx the per-destination-pixel increment; both are passed through unchanged
// to the kernels.  The C kernel resumes at x + n * dx, i.e. where the SIMD
// kernel stopped after n pixels.
#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK)                            \
  void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \
               int dx) {                                                       \
    int body = dst_width & ~MASK; /* pixels for the SIMD kernel */             \
    int tail = dst_width & MASK;  /* pixels for the C kernel */                \
    if (body > 0) {                                                            \
      TERP_SIMD(dst_ptr, src_ptr, body, x, dx);                                \
    }                                                                          \
    TERP_C(dst_ptr + body * BPP, src_ptr, tail, x + body * dx, dx);            \
  }
|
||||
|
||||
#ifdef HAS_SCALEFILTERCOLS_NEON
|
||||
CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
|
||||
#endif
|
||||
#ifdef HAS_SCALEFILTERCOLS_LSX
|
||||
CANY(ScaleFilterCols_Any_LSX, ScaleFilterCols_LSX, ScaleFilterCols_C, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_SCALEARGBCOLS_NEON
|
||||
CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_SCALEARGBCOLS_LSX
|
||||
CANY(ScaleARGBCols_Any_LSX, ScaleARGBCols_LSX, ScaleARGBCols_C, 4, 3)
|
||||
#endif
|
||||
#ifdef HAS_SCALEARGBFILTERCOLS_NEON
|
||||
CANY(ScaleARGBFilterCols_Any_NEON,
|
||||
ScaleARGBFilterCols_NEON,
|
||||
ScaleARGBFilterCols_C,
|
||||
4,
|
||||
3)
|
||||
#endif
|
||||
#ifdef HAS_SCALEARGBFILTERCOLS_LSX
|
||||
CANY(ScaleARGBFilterCols_Any_LSX,
|
||||
ScaleARGBFilterCols_LSX,
|
||||
ScaleARGBFilterCols_C,
|
||||
4,
|
||||
7)
|
||||
#endif
|
||||
#undef CANY
|
||||
|
||||
// Scale up horizontally 2 times using linear filter.
// Generates NAME for any dst_width.  The first and last destination pixels
// are plain copies of source pixels; the interior (an even count of at most
// dst_width - 2 pixels, starting at dst_ptr + 1) is produced by the kernels.
// The SIMD kernel takes the multiple of (MASK + 1); the C kernel finishes
// the remaining interior pixels, reading from src_ptr + n / 2 because each
// source pixel accounts for two destination pixels.
#define SUH2LANY(NAME, SIMD, C, MASK, PTYPE)                       \
  void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \
    int last = dst_width - 1;                                      \
    int interior = last & ~1;                                      \
    int body = interior & ~MASK;                                   \
    int tail = interior & MASK;                                    \
    dst_ptr[0] = src_ptr[0];                                       \
    if (interior > 0) {                                            \
      if (body != 0) {                                             \
        SIMD(src_ptr, dst_ptr + 1, body);                          \
      }                                                            \
      C(src_ptr + (body / 2), dst_ptr + body + 1, tail);           \
    }                                                              \
    dst_ptr[last] = src_ptr[last / 2];                             \
  }
|
||||
|
||||
// Even the C versions need to be wrapped, because boundary pixels have to
|
||||
// be handled differently
|
||||
|
||||
SUH2LANY(ScaleRowUp2_Linear_Any_C,
|
||||
ScaleRowUp2_Linear_C,
|
||||
ScaleRowUp2_Linear_C,
|
||||
0,
|
||||
uint8_t)
|
||||
|
||||
SUH2LANY(ScaleRowUp2_Linear_16_Any_C,
|
||||
ScaleRowUp2_Linear_16_C,
|
||||
ScaleRowUp2_Linear_16_C,
|
||||
0,
|
||||
uint16_t)
|
||||
|
||||
#ifdef HAS_SCALEROWUP2_LINEAR_SSE2
|
||||
SUH2LANY(ScaleRowUp2_Linear_Any_SSE2,
|
||||
ScaleRowUp2_Linear_SSE2,
|
||||
ScaleRowUp2_Linear_C,
|
||||
15,
|
||||
uint8_t)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3
|
||||
SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3,
|
||||
ScaleRowUp2_Linear_SSSE3,
|
||||
ScaleRowUp2_Linear_C,
|
||||
15,
|
||||
uint8_t)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3
|
||||
SUH2LANY(ScaleRowUp2_Linear_12_Any_SSSE3,
|
||||
ScaleRowUp2_Linear_12_SSSE3,
|
||||
ScaleRowUp2_Linear_16_C,
|
||||
15,
|
||||
uint16_t)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2
|
||||
SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2,
|
||||
ScaleRowUp2_Linear_16_SSE2,
|
||||
ScaleRowUp2_Linear_16_C,
|
||||
7,
|
||||
uint16_t)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEROWUP2_LINEAR_AVX2
|
||||
SUH2LANY(ScaleRowUp2_Linear_Any_AVX2,
|
||||
ScaleRowUp2_Linear_AVX2,
|
||||
ScaleRowUp2_Linear_C,
|
||||
31,
|
||||
uint8_t)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2
|
||||
SUH2LANY(ScaleRowUp2_Linear_12_Any_AVX2,
|
||||
ScaleRowUp2_Linear_12_AVX2,
|
||||
ScaleRowUp2_Linear_16_C,
|
||||
31,
|
||||
uint16_t)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2
|
||||
SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2,
|
||||
ScaleRowUp2_Linear_16_AVX2,
|
||||
ScaleRowUp2_Linear_16_C,
|
||||
15,
|
||||
uint16_t)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEROWUP2_LINEAR_NEON
|
||||
#ifdef __aarch64__
|
||||
SUH2LANY(ScaleRowUp2_Linear_Any_NEON,
|
||||
ScaleRowUp2_Linear_NEON,
|
||||
ScaleRowUp2_Linear_C,
|
||||
31,
|
||||
uint8_t)
|
||||
#else
|
||||
SUH2LANY(ScaleRowUp2_Linear_Any_NEON,
|
||||
ScaleRowUp2_Linear_NEON,
|
||||
ScaleRowUp2_Linear_C,
|
||||
15,
|
||||
uint8_t)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEROWUP2_LINEAR_12_NEON
|
||||
SUH2LANY(ScaleRowUp2_Linear_12_Any_NEON,
|
||||
ScaleRowUp2_Linear_12_NEON,
|
||||
ScaleRowUp2_Linear_16_C,
|
||||
15,
|
||||
uint16_t)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEROWUP2_LINEAR_16_NEON
|
||||
SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON,
|
||||
ScaleRowUp2_Linear_16_NEON,
|
||||
ScaleRowUp2_Linear_16_C,
|
||||
15,
|
||||
uint16_t)
|
||||
#endif
|
||||
|
||||
#undef SUH2LANY
|
||||
|
||||
// Scale up 2 times using bilinear filter.
// This function produces 2 rows at a time.
// Source rows are src_ptr and src_ptr + src_stride; destination rows are
// dst_ptr and dst_ptr + dst_stride.  The first and last destination columns
// are computed here as a 3:1 / 1:3 vertical blend (rounded) of the two
// source rows; the interior columns are delegated to the kernels, which
// receive the row strides as their 2nd and 4th arguments.
#define SU2BLANY(NAME, SIMD, C, MASK, PTYPE)                              \
  void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr,   \
            ptrdiff_t dst_stride, int dst_width) {                        \
    const PTYPE* sa = src_ptr;                                            \
    const PTYPE* sb = src_ptr + src_stride;                               \
    PTYPE* da = dst_ptr;                                                  \
    PTYPE* db = dst_ptr + dst_stride;                                     \
    int last = dst_width - 1;                                             \
    int mid = last / 2; /* source column feeding the last dst column */   \
    int interior = last & ~1;                                             \
    int body = interior & ~MASK;                                          \
    int tail = interior & MASK;                                           \
    da[0] = (3 * sa[0] + sb[0] + 2) >> 2;                                 \
    db[0] = (sa[0] + 3 * sb[0] + 2) >> 2;                                 \
    if (interior > 0) {                                                   \
      if (body != 0) {                                                    \
        SIMD(sa, sb - sa, da + 1, db - da, body);                         \
      }                                                                   \
      C(sa + (body / 2), sb - sa, da + body + 1, db - da, tail);          \
    }                                                                     \
    da[last] = (3 * sa[mid] + sb[mid] + 2) >> 2;                          \
    db[last] = (sa[mid] + 3 * sb[mid] + 2) >> 2;                          \
  }
|
||||
|
||||
SU2BLANY(ScaleRowUp2_Bilinear_Any_C,
|
||||
ScaleRowUp2_Bilinear_C,
|
||||
ScaleRowUp2_Bilinear_C,
|
||||
0,
|
||||
uint8_t)
|
||||
|
||||
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_C,
|
||||
ScaleRowUp2_Bilinear_16_C,
|
||||
ScaleRowUp2_Bilinear_16_C,
|
||||
0,
|
||||
uint16_t)
|
||||
|
||||
#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2
|
||||
SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2,
|
||||
ScaleRowUp2_Bilinear_SSE2,
|
||||
ScaleRowUp2_Bilinear_C,
|
||||
15,
|
||||
uint8_t)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3
|
||||
SU2BLANY(ScaleRowUp2_Bilinear_12_Any_SSSE3,
|
||||
ScaleRowUp2_Bilinear_12_SSSE3,
|
||||
ScaleRowUp2_Bilinear_16_C,
|
||||
15,
|
||||
uint16_t)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2
|
||||
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2,
|
||||
ScaleRowUp2_Bilinear_16_SSE2,
|
||||
ScaleRowUp2_Bilinear_16_C,
|
||||
7,
|
||||
uint16_t)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3
|
||||
SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3,
|
||||
ScaleRowUp2_Bilinear_SSSE3,
|
||||
ScaleRowUp2_Bilinear_C,
|
||||
15,
|
||||
uint8_t)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2
|
||||
SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2,
|
||||
ScaleRowUp2_Bilinear_AVX2,
|
||||
ScaleRowUp2_Bilinear_C,
|
||||
31,
|
||||
uint8_t)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2
|
||||
SU2BLANY(ScaleRowUp2_Bilinear_12_Any_AVX2,
|
||||
ScaleRowUp2_Bilinear_12_AVX2,
|
||||
ScaleRowUp2_Bilinear_16_C,
|
||||
15,
|
||||
uint16_t)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2
|
||||
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2,
|
||||
ScaleRowUp2_Bilinear_16_AVX2,
|
||||
ScaleRowUp2_Bilinear_16_C,
|
||||
15,
|
||||
uint16_t)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEROWUP2_BILINEAR_NEON
|
||||
SU2BLANY(ScaleRowUp2_Bilinear_Any_NEON,
|
||||
ScaleRowUp2_Bilinear_NEON,
|
||||
ScaleRowUp2_Bilinear_C,
|
||||
15,
|
||||
uint8_t)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEROWUP2_BILINEAR_12_NEON
|
||||
SU2BLANY(ScaleRowUp2_Bilinear_12_Any_NEON,
|
||||
ScaleRowUp2_Bilinear_12_NEON,
|
||||
ScaleRowUp2_Bilinear_16_C,
|
||||
15,
|
||||
uint16_t)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEROWUP2_BILINEAR_16_NEON
|
||||
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_NEON,
|
||||
ScaleRowUp2_Bilinear_16_NEON,
|
||||
ScaleRowUp2_Bilinear_16_C,
|
||||
7,
|
||||
uint16_t)
|
||||
#endif
|
||||
|
||||
#undef SU2BLANY
|
||||
|
||||
// Scale bi-planar plane up horizontally 2 times using linear filter.
// Each pixel is two PTYPE elements (indices 2 * i and 2 * i + 1), so the
// first and last destination pixels are copied as element pairs and kernel
// pointers advance two elements per destination pixel.  The interior pixel
// count is split into a SIMD part (multiple of MASK + 1) and a C part.
#define SBUH2LANY(NAME, SIMD, C, MASK, PTYPE)                            \
  void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) {       \
    int interior = (dst_width - 1) & ~1;                                 \
    int body = interior & ~MASK;                                         \
    int tail = interior & MASK;                                          \
    /* One past the source elements that feed the last dst pixel. */     \
    int src_end = (dst_width + 1) & ~1;                                  \
    dst_ptr[0] = src_ptr[0];                                             \
    dst_ptr[1] = src_ptr[1];                                             \
    if (interior > 0) {                                                  \
      if (body != 0) {                                                   \
        SIMD(src_ptr, dst_ptr + 2, body);                                \
      }                                                                  \
      C(src_ptr + body, dst_ptr + 2 * body + 2, tail);                   \
    }                                                                    \
    dst_ptr[2 * dst_width - 2] = src_ptr[src_end - 2];                   \
    dst_ptr[2 * dst_width - 1] = src_ptr[src_end - 1];                   \
  }
|
||||
|
||||
SBUH2LANY(ScaleUVRowUp2_Linear_Any_C,
|
||||
ScaleUVRowUp2_Linear_C,
|
||||
ScaleUVRowUp2_Linear_C,
|
||||
0,
|
||||
uint8_t)
|
||||
|
||||
SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_C,
|
||||
ScaleUVRowUp2_Linear_16_C,
|
||||
ScaleUVRowUp2_Linear_16_C,
|
||||
0,
|
||||
uint16_t)
|
||||
|
||||
#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3
|
||||
SBUH2LANY(ScaleUVRowUp2_Linear_Any_SSSE3,
|
||||
ScaleUVRowUp2_Linear_SSSE3,
|
||||
ScaleUVRowUp2_Linear_C,
|
||||
7,
|
||||
uint8_t)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2
|
||||
SBUH2LANY(ScaleUVRowUp2_Linear_Any_AVX2,
|
||||
ScaleUVRowUp2_Linear_AVX2,
|
||||
ScaleUVRowUp2_Linear_C,
|
||||
15,
|
||||
uint8_t)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41
|
||||
SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_SSE41,
|
||||
ScaleUVRowUp2_Linear_16_SSE41,
|
||||
ScaleUVRowUp2_Linear_16_C,
|
||||
3,
|
||||
uint16_t)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2
|
||||
SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_AVX2,
|
||||
ScaleUVRowUp2_Linear_16_AVX2,
|
||||
ScaleUVRowUp2_Linear_16_C,
|
||||
7,
|
||||
uint16_t)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEUVROWUP2_LINEAR_NEON
|
||||
SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON,
|
||||
ScaleUVRowUp2_Linear_NEON,
|
||||
ScaleUVRowUp2_Linear_C,
|
||||
15,
|
||||
uint8_t)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON
|
||||
SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_NEON,
|
||||
ScaleUVRowUp2_Linear_16_NEON,
|
||||
ScaleUVRowUp2_Linear_16_C,
|
||||
15,
|
||||
uint16_t)
|
||||
#endif
|
||||
|
||||
#undef SBUH2LANY
|
||||
|
||||
// Scale bi-planar plane up 2 times using bilinear filter.
// This function produces 2 rows at a time.
// Each pixel is two PTYPE elements.  The first and last destination pixels
// (both elements, both output rows) are computed here as a rounded 3:1 / 1:3
// vertical blend of the two source rows; interior pixels are delegated to
// the kernels, which receive the row strides as their 2nd and 4th arguments.
#define SBU2BLANY(NAME, SIMD, C, MASK, PTYPE)                             \
  void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr,   \
            ptrdiff_t dst_stride, int dst_width) {                        \
    const PTYPE* sa = src_ptr;                                            \
    const PTYPE* sb = src_ptr + src_stride;                               \
    PTYPE* da = dst_ptr;                                                  \
    PTYPE* db = dst_ptr + dst_stride;                                     \
    int interior = (dst_width - 1) & ~1;                                  \
    int body = interior & ~MASK;                                          \
    int tail = interior & MASK;                                           \
    /* Element indices of the last source pair and last dst pair. */      \
    int s0 = ((dst_width + 1) & ~1) - 2;                                  \
    int s1 = ((dst_width + 1) & ~1) - 1;                                  \
    int d0 = 2 * dst_width - 2;                                           \
    int d1 = 2 * dst_width - 1;                                           \
    da[0] = (3 * sa[0] + sb[0] + 2) >> 2;                                 \
    db[0] = (sa[0] + 3 * sb[0] + 2) >> 2;                                 \
    da[1] = (3 * sa[1] + sb[1] + 2) >> 2;                                 \
    db[1] = (sa[1] + 3 * sb[1] + 2) >> 2;                                 \
    if (interior > 0) {                                                   \
      if (body != 0) {                                                    \
        SIMD(sa, sb - sa, da + 2, db - da, body);                         \
      }                                                                   \
      C(sa + body, sb - sa, da + 2 * body + 2, db - da, tail);            \
    }                                                                     \
    da[d0] = (3 * sa[s0] + sb[s0] + 2) >> 2;                              \
    db[d0] = (sa[s0] + 3 * sb[s0] + 2) >> 2;                              \
    da[d1] = (3 * sa[s1] + sb[s1] + 2) >> 2;                              \
    db[d1] = (sa[s1] + 3 * sb[s1] + 2) >> 2;                              \
  }
|
||||
|
||||
SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C,
|
||||
ScaleUVRowUp2_Bilinear_C,
|
||||
ScaleUVRowUp2_Bilinear_C,
|
||||
0,
|
||||
uint8_t)
|
||||
|
||||
SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_C,
|
||||
ScaleUVRowUp2_Bilinear_16_C,
|
||||
ScaleUVRowUp2_Bilinear_16_C,
|
||||
0,
|
||||
uint16_t)
|
||||
|
||||
#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3
|
||||
SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_SSSE3,
|
||||
ScaleUVRowUp2_Bilinear_SSSE3,
|
||||
ScaleUVRowUp2_Bilinear_C,
|
||||
7,
|
||||
uint8_t)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2
|
||||
SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_AVX2,
|
||||
ScaleUVRowUp2_Bilinear_AVX2,
|
||||
ScaleUVRowUp2_Bilinear_C,
|
||||
15,
|
||||
uint8_t)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
|
||||
SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_SSE41,
|
||||
ScaleUVRowUp2_Bilinear_16_SSE41,
|
||||
ScaleUVRowUp2_Bilinear_16_C,
|
||||
7,
|
||||
uint16_t)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2
|
||||
SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_AVX2,
|
||||
ScaleUVRowUp2_Bilinear_16_AVX2,
|
||||
ScaleUVRowUp2_Bilinear_16_C,
|
||||
7,
|
||||
uint16_t)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEUVROWUP2_BILINEAR_NEON
|
||||
SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_NEON,
|
||||
ScaleUVRowUp2_Bilinear_NEON,
|
||||
ScaleUVRowUp2_Bilinear_C,
|
||||
7,
|
||||
uint8_t)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_NEON
|
||||
SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_NEON,
|
||||
ScaleUVRowUp2_Bilinear_16_NEON,
|
||||
ScaleUVRowUp2_Bilinear_16_C,
|
||||
7,
|
||||
uint16_t)
|
||||
#endif
|
||||
|
||||
#undef SBU2BLANY
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
1158
3rdparty/libyuv/source/scale_argb.cc
vendored
Normal file
1158
3rdparty/libyuv/source/scale_argb.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1977
3rdparty/libyuv/source/scale_common.cc
vendored
Normal file
1977
3rdparty/libyuv/source/scale_common.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
2947
3rdparty/libyuv/source/scale_gcc.cc
vendored
Normal file
2947
3rdparty/libyuv/source/scale_gcc.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
739
3rdparty/libyuv/source/scale_lsx.cc
vendored
Normal file
739
3rdparty/libyuv/source/scale_lsx.cc
vendored
Normal file
@@ -0,0 +1,739 @@
|
||||
/*
|
||||
* Copyright 2022 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2022 Loongson Technology Corporation Limited
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
#include "libyuv/scale_row.h"
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
|
||||
#include "libyuv/loongson_intrinsics.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Gather helper: pulls four integers out of vector _in (lanes 0..3, via
// __lsx_vpickve2gr_w), uses each as an index into _src, and inserts the four
// _src[] values into word lanes 0..3 of _out.
// _out is both read and assigned, so it must be an lvalue.  The expansion is
// a bare brace block that declares _tmp1.._tmp4; arguments must not use
// those names.
// NOTE(review): DUP4_ARG2 is assumed to apply the intrinsic to each argument
// pair and assign the results in order -- confirm against
// loongson_intrinsics.h.
#define LOAD_DATA(_src, _in, _out)                                       \
  {                                                                      \
    int _tmp1, _tmp2, _tmp3, _tmp4;                                      \
    DUP4_ARG2(__lsx_vpickve2gr_w, _in, 0, _in, 1, _in, 2, _in, 3, _tmp1, \
              _tmp2, _tmp3, _tmp4);                                      \
    _out = __lsx_vinsgr2vr_w(_out, _src[_tmp1], 0);                      \
    _out = __lsx_vinsgr2vr_w(_out, _src[_tmp2], 1);                      \
    _out = __lsx_vinsgr2vr_w(_out, _src[_tmp3], 2);                      \
    _out = __lsx_vinsgr2vr_w(_out, _src[_tmp4], 3);                      \
  }
|
||||
|
||||
void ScaleARGBRowDown2_LSX(const uint8_t* src_argb,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_argb,
|
||||
int dst_width) {
|
||||
int x;
|
||||
int len = dst_width / 4;
|
||||
(void)src_stride;
|
||||
__m128i src0, src1, dst0;
|
||||
|
||||
for (x = 0; x < len; x++) {
|
||||
DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
|
||||
dst0 = __lsx_vpickod_w(src1, src0);
|
||||
__lsx_vst(dst0, dst_argb, 0);
|
||||
src_argb += 32;
|
||||
dst_argb += 16;
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleARGBRowDown2Linear_LSX(const uint8_t* src_argb,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_argb,
|
||||
int dst_width) {
|
||||
int x;
|
||||
int len = dst_width / 4;
|
||||
(void)src_stride;
|
||||
__m128i src0, src1, tmp0, tmp1, dst0;
|
||||
|
||||
for (x = 0; x < len; x++) {
|
||||
DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
|
||||
tmp0 = __lsx_vpickev_w(src1, src0);
|
||||
tmp1 = __lsx_vpickod_w(src1, src0);
|
||||
dst0 = __lsx_vavgr_bu(tmp1, tmp0);
|
||||
__lsx_vst(dst0, dst_argb, 0);
|
||||
src_argb += 32;
|
||||
dst_argb += 16;
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleARGBRowDown2Box_LSX(const uint8_t* src_argb,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_argb,
|
||||
int dst_width) {
|
||||
int x;
|
||||
int len = dst_width / 4;
|
||||
const uint8_t* s = src_argb;
|
||||
const uint8_t* t = src_argb + src_stride;
|
||||
__m128i src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3, dst0;
|
||||
__m128i reg0, reg1, reg2, reg3;
|
||||
__m128i shuff = {0x0703060205010400, 0x0F0B0E0A0D090C08};
|
||||
|
||||
for (x = 0; x < len; x++) {
|
||||
DUP2_ARG2(__lsx_vld, s, 0, s, 16, src0, src1);
|
||||
DUP2_ARG2(__lsx_vld, t, 0, t, 16, src2, src3);
|
||||
DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff, src1, src1, shuff, src2, src2,
|
||||
shuff, src3, src3, shuff, tmp0, tmp1, tmp2, tmp3);
|
||||
DUP4_ARG2(__lsx_vhaddw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3,
|
||||
tmp3, reg0, reg1, reg2, reg3);
|
||||
DUP2_ARG2(__lsx_vsadd_hu, reg0, reg2, reg1, reg3, reg0, reg1);
|
||||
dst0 = __lsx_vsrarni_b_h(reg1, reg0, 2);
|
||||
__lsx_vst(dst0, dst_argb, 0);
|
||||
s += 32;
|
||||
t += 32;
|
||||
dst_argb += 16;
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleARGBRowDownEven_LSX(const uint8_t* src_argb,
|
||||
ptrdiff_t src_stride,
|
||||
int32_t src_stepx,
|
||||
uint8_t* dst_argb,
|
||||
int dst_width) {
|
||||
int x;
|
||||
int len = dst_width / 4;
|
||||
int32_t stepx = src_stepx << 2;
|
||||
(void)src_stride;
|
||||
__m128i dst0, dst1, dst2, dst3;
|
||||
|
||||
for (x = 0; x < len; x++) {
|
||||
dst0 = __lsx_vldrepl_w(src_argb, 0);
|
||||
src_argb += stepx;
|
||||
dst1 = __lsx_vldrepl_w(src_argb, 0);
|
||||
src_argb += stepx;
|
||||
dst2 = __lsx_vldrepl_w(src_argb, 0);
|
||||
src_argb += stepx;
|
||||
dst3 = __lsx_vldrepl_w(src_argb, 0);
|
||||
src_argb += stepx;
|
||||
__lsx_vstelm_w(dst0, dst_argb, 0, 0);
|
||||
__lsx_vstelm_w(dst1, dst_argb, 4, 0);
|
||||
__lsx_vstelm_w(dst2, dst_argb, 8, 0);
|
||||
__lsx_vstelm_w(dst3, dst_argb, 12, 0);
|
||||
dst_argb += 16;
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleARGBRowDownEvenBox_LSX(const uint8_t* src_argb,
|
||||
ptrdiff_t src_stride,
|
||||
int src_stepx,
|
||||
uint8_t* dst_argb,
|
||||
int dst_width) {
|
||||
int x;
|
||||
int len = dst_width / 4;
|
||||
int32_t stepx = src_stepx * 4;
|
||||
const uint8_t* next_argb = src_argb + src_stride;
|
||||
__m128i src0, src1, src2, src3;
|
||||
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
||||
__m128i reg0, reg1, dst0;
|
||||
|
||||
for (x = 0; x < len; x++) {
|
||||
tmp0 = __lsx_vldrepl_d(src_argb, 0);
|
||||
src_argb += stepx;
|
||||
tmp1 = __lsx_vldrepl_d(src_argb, 0);
|
||||
src_argb += stepx;
|
||||
tmp2 = __lsx_vldrepl_d(src_argb, 0);
|
||||
src_argb += stepx;
|
||||
tmp3 = __lsx_vldrepl_d(src_argb, 0);
|
||||
src_argb += stepx;
|
||||
tmp4 = __lsx_vldrepl_d(next_argb, 0);
|
||||
next_argb += stepx;
|
||||
tmp5 = __lsx_vldrepl_d(next_argb, 0);
|
||||
next_argb += stepx;
|
||||
tmp6 = __lsx_vldrepl_d(next_argb, 0);
|
||||
next_argb += stepx;
|
||||
tmp7 = __lsx_vldrepl_d(next_argb, 0);
|
||||
next_argb += stepx;
|
||||
DUP4_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
|
||||
src0, src1, src2, src3);
|
||||
DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2);
|
||||
DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3);
|
||||
DUP2_ARG2(__lsx_vpackev_w, tmp1, tmp0, tmp3, tmp2, reg0, reg1);
|
||||
DUP2_ARG2(__lsx_vpackod_w, tmp1, tmp0, tmp3, tmp2, tmp4, tmp5);
|
||||
DUP2_ARG2(__lsx_vadd_h, reg0, tmp4, reg1, tmp5, reg0, reg1);
|
||||
dst0 = __lsx_vsrarni_b_h(reg1, reg0, 2);
|
||||
dst0 = __lsx_vshuf4i_b(dst0, 0xD8);
|
||||
__lsx_vst(dst0, dst_argb, 0);
|
||||
dst_argb += 16;
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleRowDown2_LSX(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
int x;
|
||||
int len = dst_width / 32;
|
||||
__m128i src0, src1, src2, src3, dst0, dst1;
|
||||
(void)src_stride;
|
||||
|
||||
for (x = 0; x < len; x++) {
|
||||
DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
|
||||
src0, src1, src2, src3);
|
||||
DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, dst0, dst1);
|
||||
__lsx_vst(dst0, dst, 0);
|
||||
__lsx_vst(dst1, dst, 16);
|
||||
src_ptr += 64;
|
||||
dst += 32;
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleRowDown2Linear_LSX(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
int x;
|
||||
int len = dst_width / 32;
|
||||
__m128i src0, src1, src2, src3;
|
||||
__m128i tmp0, tmp1, tmp2, tmp3, dst0, dst1;
|
||||
(void)src_stride;
|
||||
|
||||
for (x = 0; x < len; x++) {
|
||||
DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
|
||||
src0, src1, src2, src3);
|
||||
DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp2);
|
||||
DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, tmp1, tmp3);
|
||||
DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp1, tmp2, tmp3, dst0, dst1);
|
||||
__lsx_vst(dst0, dst, 0);
|
||||
__lsx_vst(dst1, dst, 16);
|
||||
src_ptr += 64;
|
||||
dst += 32;
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleRowDown2Box_LSX(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
int x;
|
||||
int len = dst_width / 32;
|
||||
const uint8_t* src_nex = src_ptr + src_stride;
|
||||
__m128i src0, src1, src2, src3, src4, src5, src6, src7;
|
||||
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
||||
__m128i dst0, dst1;
|
||||
|
||||
for (x = 0; x < len; x++) {
|
||||
DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
|
||||
src0, src1, src2, src3);
|
||||
DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48,
|
||||
src4, src5, src6, src7);
|
||||
DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3,
|
||||
src7, tmp0, tmp2, tmp4, tmp6);
|
||||
DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3,
|
||||
src7, tmp1, tmp3, tmp5, tmp7);
|
||||
DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
|
||||
tmp0, tmp1, tmp2, tmp3);
|
||||
DUP2_ARG3(__lsx_vsrarni_b_h, tmp1, tmp0, 2, tmp3, tmp2, 2, dst0, dst1);
|
||||
__lsx_vst(dst0, dst, 0);
|
||||
__lsx_vst(dst1, dst, 16);
|
||||
src_ptr += 64;
|
||||
src_nex += 64;
|
||||
dst += 32;
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleRowDown4_LSX(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
int x;
|
||||
int len = dst_width / 16;
|
||||
__m128i src0, src1, src2, src3, tmp0, tmp1, dst0;
|
||||
(void)src_stride;
|
||||
|
||||
for (x = 0; x < len; x++) {
|
||||
DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
|
||||
src0, src1, src2, src3);
|
||||
DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp1);
|
||||
dst0 = __lsx_vpickod_b(tmp1, tmp0);
|
||||
__lsx_vst(dst0, dst, 0);
|
||||
src_ptr += 64;
|
||||
dst += 16;
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleRowDown4Box_LSX(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
int x;
|
||||
int len = dst_width / 16;
|
||||
const uint8_t* ptr1 = src_ptr + src_stride;
|
||||
const uint8_t* ptr2 = ptr1 + src_stride;
|
||||
const uint8_t* ptr3 = ptr2 + src_stride;
|
||||
__m128i src0, src1, src2, src3, src4, src5, src6, src7;
|
||||
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
||||
__m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, dst0;
|
||||
|
||||
for (x = 0; x < len; x++) {
|
||||
DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
|
||||
src0, src1, src2, src3);
|
||||
DUP4_ARG2(__lsx_vld, ptr1, 0, ptr1, 16, ptr1, 32, ptr1, 48, src4, src5,
|
||||
src6, src7);
|
||||
DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3,
|
||||
src7, tmp0, tmp2, tmp4, tmp6);
|
||||
DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3,
|
||||
src7, tmp1, tmp3, tmp5, tmp7);
|
||||
DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
|
||||
reg0, reg1, reg2, reg3);
|
||||
DUP4_ARG2(__lsx_vld, ptr2, 0, ptr2, 16, ptr2, 32, ptr2, 48, src0, src1,
|
||||
src2, src3);
|
||||
DUP4_ARG2(__lsx_vld, ptr3, 0, ptr3, 16, ptr3, 32, ptr3, 48, src4, src5,
|
||||
src6, src7);
|
||||
DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3,
|
||||
src7, tmp0, tmp2, tmp4, tmp6);
|
||||
DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3,
|
||||
src7, tmp1, tmp3, tmp5, tmp7);
|
||||
DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
|
||||
reg4, reg5, reg6, reg7);
|
||||
DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
|
||||
reg0, reg1, reg2, reg3);
|
||||
DUP4_ARG2(__lsx_vhaddw_wu_hu, reg0, reg0, reg1, reg1, reg2, reg2, reg3,
|
||||
reg3, reg0, reg1, reg2, reg3);
|
||||
DUP2_ARG3(__lsx_vsrarni_h_w, reg1, reg0, 4, reg3, reg2, 4, tmp0, tmp1);
|
||||
dst0 = __lsx_vpickev_b(tmp1, tmp0);
|
||||
__lsx_vst(dst0, dst, 0);
|
||||
src_ptr += 64;
|
||||
ptr1 += 64;
|
||||
ptr2 += 64;
|
||||
ptr3 += 64;
|
||||
dst += 16;
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleRowDown38_LSX(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
int x, len;
|
||||
__m128i src0, src1, tmp0;
|
||||
__m128i shuff = {0x13100E0B08060300, 0x000000001E1B1816};
|
||||
|
||||
assert(dst_width % 3 == 0);
|
||||
len = dst_width / 12;
|
||||
(void)src_stride;
|
||||
|
||||
for (x = 0; x < len; x++) {
|
||||
DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
|
||||
tmp0 = __lsx_vshuf_b(src1, src0, shuff);
|
||||
__lsx_vstelm_d(tmp0, dst, 0, 0);
|
||||
__lsx_vstelm_w(tmp0, dst, 8, 2);
|
||||
src_ptr += 32;
|
||||
dst += 12;
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleRowDown38_2_Box_LSX(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
int x, len;
|
||||
const uint8_t* src_nex = src_ptr + src_stride;
|
||||
__m128i src0, src1, src2, src3, dst0;
|
||||
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
||||
__m128i reg0, reg1, reg2, reg3;
|
||||
__m128i shuff = {0x0A08160604120200, 0x000000001E0E0C1A};
|
||||
__m128i const_0x2AAA = __lsx_vreplgr2vr_h(0x2AAA);
|
||||
__m128i const_0x4000 = __lsx_vreplgr2vr_w(0x4000);
|
||||
|
||||
assert((dst_width % 3 == 0) && (dst_width > 0));
|
||||
len = dst_width / 12;
|
||||
|
||||
for (x = 0; x < len; x++) {
|
||||
DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_nex, 0, src_nex, 16, src0,
|
||||
src1, src2, src3);
|
||||
DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2);
|
||||
DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3);
|
||||
DUP2_ARG2(__lsx_vpickev_h, tmp2, tmp0, tmp3, tmp1, reg0, reg1);
|
||||
DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, reg2, reg3);
|
||||
tmp4 = __lsx_vpickev_w(reg3, reg2);
|
||||
tmp5 = __lsx_vadd_h(reg0, reg1);
|
||||
tmp6 = __lsx_vadd_h(tmp5, tmp4);
|
||||
tmp7 = __lsx_vmuh_h(tmp6, const_0x2AAA);
|
||||
tmp0 = __lsx_vpickod_w(reg3, reg2);
|
||||
tmp1 = __lsx_vhaddw_wu_hu(tmp0, tmp0);
|
||||
tmp2 = __lsx_vmul_w(tmp1, const_0x4000);
|
||||
dst0 = __lsx_vshuf_b(tmp2, tmp7, shuff);
|
||||
__lsx_vstelm_d(dst0, dst_ptr, 0, 0);
|
||||
__lsx_vstelm_w(dst0, dst_ptr, 8, 2);
|
||||
src_ptr += 32;
|
||||
src_nex += 32;
|
||||
dst_ptr += 12;
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleRowDown38_3_Box_LSX(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
int x, len;
|
||||
const uint8_t* ptr1 = src_ptr + src_stride;
|
||||
const uint8_t* ptr2 = ptr1 + src_stride;
|
||||
__m128i src0, src1, src2, src3, src4, src5;
|
||||
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
||||
__m128i reg0, reg1, reg2, reg3, dst0;
|
||||
__m128i zero = __lsx_vldi(0);
|
||||
__m128i shuff = {0x0A08160604120200, 0x000000001E0E0C1A};
|
||||
__m128i const_0x1C71 = __lsx_vreplgr2vr_h(0x1C71);
|
||||
__m128i const_0x2AAA = __lsx_vreplgr2vr_w(0x2AAA);
|
||||
|
||||
assert((dst_width % 3 == 0) && (dst_width > 0));
|
||||
len = dst_width / 12;
|
||||
|
||||
for (x = 0; x < len; x++) {
|
||||
DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, ptr1, 0, ptr1, 16, src0, src1,
|
||||
src2, src3);
|
||||
DUP2_ARG2(__lsx_vld, ptr2, 0, ptr2, 16, src4, src5);
|
||||
DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2);
|
||||
DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3);
|
||||
DUP2_ARG2(__lsx_vpackev_b, zero, src4, zero, src5, tmp4, tmp6);
|
||||
DUP2_ARG2(__lsx_vpackod_b, zero, src4, zero, src5, tmp5, tmp7);
|
||||
DUP4_ARG2(__lsx_vadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7,
|
||||
tmp0, tmp1, tmp2, tmp3);
|
||||
DUP2_ARG2(__lsx_vpickev_h, tmp2, tmp0, tmp3, tmp1, reg0, reg1);
|
||||
DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, reg2, reg3);
|
||||
tmp4 = __lsx_vpickev_w(reg3, reg2);
|
||||
tmp5 = __lsx_vadd_h(reg0, reg1);
|
||||
tmp6 = __lsx_vadd_h(tmp5, tmp4);
|
||||
tmp7 = __lsx_vmuh_h(tmp6, const_0x1C71);
|
||||
tmp0 = __lsx_vpickod_w(reg3, reg2);
|
||||
tmp1 = __lsx_vhaddw_wu_hu(tmp0, tmp0);
|
||||
tmp2 = __lsx_vmul_w(tmp1, const_0x2AAA);
|
||||
dst0 = __lsx_vshuf_b(tmp2, tmp7, shuff);
|
||||
__lsx_vstelm_d(dst0, dst_ptr, 0, 0);
|
||||
__lsx_vstelm_w(dst0, dst_ptr, 8, 2);
|
||||
src_ptr += 32;
|
||||
ptr1 += 32;
|
||||
ptr2 += 32;
|
||||
dst_ptr += 12;
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleAddRow_LSX(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
|
||||
int x;
|
||||
int len = src_width / 16;
|
||||
__m128i src0, tmp0, tmp1, dst0, dst1;
|
||||
__m128i zero = __lsx_vldi(0);
|
||||
|
||||
assert(src_width > 0);
|
||||
|
||||
for (x = 0; x < len; x++) {
|
||||
src0 = __lsx_vld(src_ptr, 0);
|
||||
DUP2_ARG2(__lsx_vld, dst_ptr, 0, dst_ptr, 16, dst0, dst1);
|
||||
tmp0 = __lsx_vilvl_b(zero, src0);
|
||||
tmp1 = __lsx_vilvh_b(zero, src0);
|
||||
DUP2_ARG2(__lsx_vadd_h, dst0, tmp0, dst1, tmp1, dst0, dst1);
|
||||
__lsx_vst(dst0, dst_ptr, 0);
|
||||
__lsx_vst(dst1, dst_ptr, 16);
|
||||
src_ptr += 16;
|
||||
dst_ptr += 16;
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleFilterCols_LSX(uint8_t* dst_ptr,
|
||||
const uint8_t* src_ptr,
|
||||
int dst_width,
|
||||
int x,
|
||||
int dx) {
|
||||
int j;
|
||||
int len = dst_width / 16;
|
||||
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
||||
__m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
|
||||
__m128i vec0, vec1, dst0;
|
||||
__m128i vec_x = __lsx_vreplgr2vr_w(x);
|
||||
__m128i vec_dx = __lsx_vreplgr2vr_w(dx);
|
||||
__m128i const1 = __lsx_vreplgr2vr_w(0xFFFF);
|
||||
__m128i const2 = __lsx_vreplgr2vr_w(0x40);
|
||||
__m128i const_tmp = {0x0000000100000000, 0x0000000300000002};
|
||||
|
||||
vec0 = __lsx_vmul_w(vec_dx, const_tmp);
|
||||
vec1 = __lsx_vslli_w(vec_dx, 2);
|
||||
vec_x = __lsx_vadd_w(vec_x, vec0);
|
||||
|
||||
for (j = 0; j < len; j++) {
|
||||
tmp0 = __lsx_vsrai_w(vec_x, 16);
|
||||
tmp4 = __lsx_vand_v(vec_x, const1);
|
||||
vec_x = __lsx_vadd_w(vec_x, vec1);
|
||||
tmp1 = __lsx_vsrai_w(vec_x, 16);
|
||||
tmp5 = __lsx_vand_v(vec_x, const1);
|
||||
vec_x = __lsx_vadd_w(vec_x, vec1);
|
||||
tmp2 = __lsx_vsrai_w(vec_x, 16);
|
||||
tmp6 = __lsx_vand_v(vec_x, const1);
|
||||
vec_x = __lsx_vadd_w(vec_x, vec1);
|
||||
tmp3 = __lsx_vsrai_w(vec_x, 16);
|
||||
tmp7 = __lsx_vand_v(vec_x, const1);
|
||||
vec_x = __lsx_vadd_w(vec_x, vec1);
|
||||
DUP4_ARG2(__lsx_vsrai_w, tmp4, 9, tmp5, 9, tmp6, 9, tmp7, 9, tmp4, tmp5,
|
||||
tmp6, tmp7);
|
||||
LOAD_DATA(src_ptr, tmp0, reg0);
|
||||
LOAD_DATA(src_ptr, tmp1, reg1);
|
||||
LOAD_DATA(src_ptr, tmp2, reg2);
|
||||
LOAD_DATA(src_ptr, tmp3, reg3);
|
||||
DUP4_ARG2(__lsx_vaddi_wu, tmp0, 1, tmp1, 1, tmp2, 1, tmp3, 1, tmp0, tmp1,
|
||||
tmp2, tmp3);
|
||||
LOAD_DATA(src_ptr, tmp0, reg4);
|
||||
LOAD_DATA(src_ptr, tmp1, reg5);
|
||||
LOAD_DATA(src_ptr, tmp2, reg6);
|
||||
LOAD_DATA(src_ptr, tmp3, reg7);
|
||||
DUP4_ARG2(__lsx_vsub_w, reg4, reg0, reg5, reg1, reg6, reg2, reg7, reg3,
|
||||
reg4, reg5, reg6, reg7);
|
||||
DUP4_ARG2(__lsx_vmul_w, reg4, tmp4, reg5, tmp5, reg6, tmp6, reg7, tmp7,
|
||||
reg4, reg5, reg6, reg7);
|
||||
DUP4_ARG2(__lsx_vadd_w, reg4, const2, reg5, const2, reg6, const2, reg7,
|
||||
const2, reg4, reg5, reg6, reg7);
|
||||
DUP4_ARG2(__lsx_vsrai_w, reg4, 7, reg5, 7, reg6, 7, reg7, 7, reg4, reg5,
|
||||
reg6, reg7);
|
||||
DUP4_ARG2(__lsx_vadd_w, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
|
||||
reg0, reg1, reg2, reg3);
|
||||
DUP2_ARG2(__lsx_vpickev_h, reg1, reg0, reg3, reg2, tmp0, tmp1);
|
||||
dst0 = __lsx_vpickev_b(tmp1, tmp0);
|
||||
__lsx_vst(dst0, dst_ptr, 0);
|
||||
dst_ptr += 16;
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleARGBCols_LSX(uint8_t* dst_argb,
|
||||
const uint8_t* src_argb,
|
||||
int dst_width,
|
||||
int x,
|
||||
int dx) {
|
||||
const uint32_t* src = (const uint32_t*)src_argb;
|
||||
uint32_t* dst = (uint32_t*)dst_argb;
|
||||
int j;
|
||||
int len = dst_width / 4;
|
||||
__m128i tmp0, tmp1, tmp2, dst0;
|
||||
__m128i vec_x = __lsx_vreplgr2vr_w(x);
|
||||
__m128i vec_dx = __lsx_vreplgr2vr_w(dx);
|
||||
__m128i const_tmp = {0x0000000100000000, 0x0000000300000002};
|
||||
|
||||
tmp0 = __lsx_vmul_w(vec_dx, const_tmp);
|
||||
tmp1 = __lsx_vslli_w(vec_dx, 2);
|
||||
vec_x = __lsx_vadd_w(vec_x, tmp0);
|
||||
|
||||
for (j = 0; j < len; j++) {
|
||||
tmp2 = __lsx_vsrai_w(vec_x, 16);
|
||||
vec_x = __lsx_vadd_w(vec_x, tmp1);
|
||||
LOAD_DATA(src, tmp2, dst0);
|
||||
__lsx_vst(dst0, dst, 0);
|
||||
dst += 4;
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleARGBFilterCols_LSX(uint8_t* dst_argb,
|
||||
const uint8_t* src_argb,
|
||||
int dst_width,
|
||||
int x,
|
||||
int dx) {
|
||||
const uint32_t* src = (const uint32_t*)src_argb;
|
||||
int j;
|
||||
int len = dst_width / 8;
|
||||
__m128i src0, src1, src2, src3;
|
||||
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
||||
__m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
|
||||
__m128i vec0, vec1, dst0, dst1;
|
||||
__m128i vec_x = __lsx_vreplgr2vr_w(x);
|
||||
__m128i vec_dx = __lsx_vreplgr2vr_w(dx);
|
||||
__m128i const_tmp = {0x0000000100000000, 0x0000000300000002};
|
||||
__m128i const_7f = __lsx_vldi(0x7F);
|
||||
|
||||
vec0 = __lsx_vmul_w(vec_dx, const_tmp);
|
||||
vec1 = __lsx_vslli_w(vec_dx, 2);
|
||||
vec_x = __lsx_vadd_w(vec_x, vec0);
|
||||
|
||||
for (j = 0; j < len; j++) {
|
||||
tmp0 = __lsx_vsrai_w(vec_x, 16);
|
||||
reg0 = __lsx_vsrai_w(vec_x, 9);
|
||||
vec_x = __lsx_vadd_w(vec_x, vec1);
|
||||
tmp1 = __lsx_vsrai_w(vec_x, 16);
|
||||
reg1 = __lsx_vsrai_w(vec_x, 9);
|
||||
vec_x = __lsx_vadd_w(vec_x, vec1);
|
||||
DUP2_ARG2(__lsx_vand_v, reg0, const_7f, reg1, const_7f, reg0, reg1);
|
||||
DUP2_ARG2(__lsx_vshuf4i_b, reg0, 0, reg1, 0, reg0, reg1);
|
||||
DUP2_ARG2(__lsx_vxor_v, reg0, const_7f, reg1, const_7f, reg2, reg3);
|
||||
DUP2_ARG2(__lsx_vilvl_b, reg0, reg2, reg1, reg3, reg4, reg6);
|
||||
DUP2_ARG2(__lsx_vilvh_b, reg0, reg2, reg1, reg3, reg5, reg7);
|
||||
LOAD_DATA(src, tmp0, src0);
|
||||
LOAD_DATA(src, tmp1, src1);
|
||||
DUP2_ARG2(__lsx_vaddi_wu, tmp0, 1, tmp1, 1, tmp0, tmp1);
|
||||
LOAD_DATA(src, tmp0, src2);
|
||||
LOAD_DATA(src, tmp1, src3);
|
||||
DUP2_ARG2(__lsx_vilvl_b, src2, src0, src3, src1, tmp4, tmp6);
|
||||
DUP2_ARG2(__lsx_vilvh_b, src2, src0, src3, src1, tmp5, tmp7);
|
||||
DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, reg4, tmp5, reg5, tmp6, reg6, tmp7, reg7,
|
||||
tmp0, tmp1, tmp2, tmp3);
|
||||
DUP2_ARG3(__lsx_vsrani_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, dst0, dst1);
|
||||
__lsx_vst(dst0, dst_argb, 0);
|
||||
__lsx_vst(dst1, dst_argb, 16);
|
||||
dst_argb += 32;
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleRowDown34_LSX(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
int x;
|
||||
(void)src_stride;
|
||||
__m128i src0, src1, src2, src3;
|
||||
__m128i dst0, dst1, dst2;
|
||||
__m128i shuff0 = {0x0908070504030100, 0x141311100F0D0C0B};
|
||||
__m128i shuff1 = {0x0F0D0C0B09080705, 0x1918171514131110};
|
||||
__m128i shuff2 = {0x141311100F0D0C0B, 0x1F1D1C1B19181715};
|
||||
|
||||
assert((dst_width % 3 == 0) && (dst_width > 0));
|
||||
|
||||
for (x = 0; x < dst_width; x += 48) {
|
||||
DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
|
||||
src0, src1, src2, src3);
|
||||
DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0, src2, src1, shuff1, dst0,
|
||||
dst1);
|
||||
dst2 = __lsx_vshuf_b(src3, src2, shuff2);
|
||||
__lsx_vst(dst0, dst, 0);
|
||||
__lsx_vst(dst1, dst, 16);
|
||||
__lsx_vst(dst2, dst, 32);
|
||||
src_ptr += 64;
|
||||
dst += 48;
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleRowDown34_0_Box_LSX(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* d,
|
||||
int dst_width) {
|
||||
const uint8_t* src_nex = src_ptr + src_stride;
|
||||
int x;
|
||||
__m128i src0, src1, src2, src3, src4, src5, src6, src7;
|
||||
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
|
||||
__m128i tmp10, tmp11, dst0, dst1, dst2;
|
||||
__m128i const0 = {0x0103030101010103, 0x0101010303010101};
|
||||
__m128i const1 = {0x0301010101030301, 0x0103030101010103};
|
||||
__m128i const2 = {0x0101010303010101, 0x0301010101030301};
|
||||
__m128i shuff0 = {0x0504030202010100, 0x0A09090807060605};
|
||||
__m128i shuff1 = {0x0F0E0E0D0D0C0B0A, 0x1514131212111110};
|
||||
__m128i shuff2 = {0x0A09090807060605, 0x0F0E0E0D0D0C0B0A};
|
||||
__m128i shift0 = {0x0002000200010002, 0x0001000200020001};
|
||||
__m128i shift1 = {0x0002000100020002, 0x0002000200010002};
|
||||
__m128i shift2 = {0x0001000200020001, 0x0002000100020002};
|
||||
|
||||
assert((dst_width % 3 == 0) && (dst_width > 0));
|
||||
|
||||
for (x = 0; x < dst_width; x += 48) {
|
||||
DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
|
||||
src0, src1, src2, src3);
|
||||
DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48,
|
||||
src4, src5, src6, src7);
|
||||
DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff0, src1, src0, shuff1, src1, src1,
|
||||
shuff2, src2, src2, shuff0, tmp0, tmp1, tmp2, tmp3);
|
||||
DUP4_ARG3(__lsx_vshuf_b, src3, src2, shuff1, src3, src3, shuff2, src4, src4,
|
||||
shuff0, src5, src4, shuff1, tmp4, tmp5, tmp6, tmp7);
|
||||
DUP4_ARG3(__lsx_vshuf_b, src5, src5, shuff2, src6, src6, shuff0, src7, src6,
|
||||
shuff1, src7, src7, shuff2, tmp8, tmp9, tmp10, tmp11);
|
||||
DUP4_ARG2(__lsx_vdp2_h_bu, tmp0, const0, tmp1, const1, tmp2, const2, tmp3,
|
||||
const0, src0, src1, src2, src3);
|
||||
DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, const1, tmp5, const2, tmp6, const0, tmp7,
|
||||
const1, src4, src5, src6, src7);
|
||||
DUP4_ARG2(__lsx_vdp2_h_bu, tmp8, const2, tmp9, const0, tmp10, const1, tmp11,
|
||||
const2, tmp0, tmp1, tmp2, tmp3);
|
||||
DUP4_ARG2(__lsx_vsrar_h, src0, shift0, src1, shift1, src2, shift2, src3,
|
||||
shift0, src0, src1, src2, src3);
|
||||
DUP4_ARG2(__lsx_vsrar_h, src4, shift1, src5, shift2, src6, shift0, src7,
|
||||
shift1, src4, src5, src6, src7);
|
||||
DUP4_ARG2(__lsx_vsrar_h, tmp0, shift2, tmp1, shift0, tmp2, shift1, tmp3,
|
||||
shift2, tmp0, tmp1, tmp2, tmp3);
|
||||
DUP4_ARG2(__lsx_vslli_h, src0, 1, src1, 1, src2, 1, src3, 1, tmp5, tmp6,
|
||||
tmp7, tmp8);
|
||||
DUP2_ARG2(__lsx_vslli_h, src4, 1, src5, 1, tmp9, tmp10);
|
||||
DUP4_ARG2(__lsx_vadd_h, src0, tmp5, src1, tmp6, src2, tmp7, src3, tmp8,
|
||||
src0, src1, src2, src3);
|
||||
DUP2_ARG2(__lsx_vadd_h, src4, tmp9, src5, tmp10, src4, src5);
|
||||
DUP4_ARG2(__lsx_vadd_h, src0, src6, src1, src7, src2, tmp0, src3, tmp1,
|
||||
src0, src1, src2, src3);
|
||||
DUP2_ARG2(__lsx_vadd_h, src4, tmp2, src5, tmp3, src4, src5);
|
||||
DUP2_ARG3(__lsx_vsrarni_b_h, src1, src0, 2, src3, src2, 2, dst0, dst1);
|
||||
dst2 = __lsx_vsrarni_b_h(src5, src4, 2);
|
||||
__lsx_vst(dst0, d, 0);
|
||||
__lsx_vst(dst1, d, 16);
|
||||
__lsx_vst(dst2, d, 32);
|
||||
src_ptr += 64;
|
||||
src_nex += 64;
|
||||
d += 48;
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleRowDown34_1_Box_LSX(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* d,
|
||||
int dst_width) {
|
||||
const uint8_t* src_nex = src_ptr + src_stride;
|
||||
int x;
|
||||
__m128i src0, src1, src2, src3, src4, src5, src6, src7;
|
||||
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
|
||||
__m128i tmp10, tmp11, dst0, dst1, dst2;
|
||||
__m128i const0 = {0x0103030101010103, 0x0101010303010101};
|
||||
__m128i const1 = {0x0301010101030301, 0x0103030101010103};
|
||||
__m128i const2 = {0x0101010303010101, 0x0301010101030301};
|
||||
__m128i shuff0 = {0x0504030202010100, 0x0A09090807060605};
|
||||
__m128i shuff1 = {0x0F0E0E0D0D0C0B0A, 0x1514131212111110};
|
||||
__m128i shuff2 = {0x0A09090807060605, 0x0F0E0E0D0D0C0B0A};
|
||||
__m128i shift0 = {0x0002000200010002, 0x0001000200020001};
|
||||
__m128i shift1 = {0x0002000100020002, 0x0002000200010002};
|
||||
__m128i shift2 = {0x0001000200020001, 0x0002000100020002};
|
||||
|
||||
assert((dst_width % 3 == 0) && (dst_width > 0));
|
||||
|
||||
for (x = 0; x < dst_width; x += 48) {
|
||||
DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
|
||||
src0, src1, src2, src3);
|
||||
DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48,
|
||||
src4, src5, src6, src7);
|
||||
DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff0, src1, src0, shuff1, src1, src1,
|
||||
shuff2, src2, src2, shuff0, tmp0, tmp1, tmp2, tmp3);
|
||||
DUP4_ARG3(__lsx_vshuf_b, src3, src2, shuff1, src3, src3, shuff2, src4, src4,
|
||||
shuff0, src5, src4, shuff1, tmp4, tmp5, tmp6, tmp7);
|
||||
DUP4_ARG3(__lsx_vshuf_b, src5, src5, shuff2, src6, src6, shuff0, src7, src6,
|
||||
shuff1, src7, src7, shuff2, tmp8, tmp9, tmp10, tmp11);
|
||||
DUP4_ARG2(__lsx_vdp2_h_bu, tmp0, const0, tmp1, const1, tmp2, const2, tmp3,
|
||||
const0, src0, src1, src2, src3);
|
||||
DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, const1, tmp5, const2, tmp6, const0, tmp7,
|
||||
const1, src4, src5, src6, src7);
|
||||
DUP4_ARG2(__lsx_vdp2_h_bu, tmp8, const2, tmp9, const0, tmp10, const1, tmp11,
|
||||
const2, tmp0, tmp1, tmp2, tmp3);
|
||||
DUP4_ARG2(__lsx_vsrar_h, src0, shift0, src1, shift1, src2, shift2, src3,
|
||||
shift0, src0, src1, src2, src3);
|
||||
DUP4_ARG2(__lsx_vsrar_h, src4, shift1, src5, shift2, src6, shift0, src7,
|
||||
shift1, src4, src5, src6, src7);
|
||||
DUP4_ARG2(__lsx_vsrar_h, tmp0, shift2, tmp1, shift0, tmp2, shift1, tmp3,
|
||||
shift2, tmp0, tmp1, tmp2, tmp3);
|
||||
DUP4_ARG2(__lsx_vadd_h, src0, src6, src1, src7, src2, tmp0, src3, tmp1,
|
||||
src0, src1, src2, src3);
|
||||
DUP2_ARG2(__lsx_vadd_h, src4, tmp2, src5, tmp3, src4, src5);
|
||||
DUP2_ARG3(__lsx_vsrarni_b_h, src1, src0, 1, src3, src2, 1, dst0, dst1);
|
||||
dst2 = __lsx_vsrarni_b_h(src5, src4, 1);
|
||||
__lsx_vst(dst0, d, 0);
|
||||
__lsx_vst(dst1, d, 16);
|
||||
__lsx_vst(dst2, d, 32);
|
||||
src_ptr += 64;
|
||||
src_nex += 64;
|
||||
d += 48;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
|
||||
1449
3rdparty/libyuv/source/scale_neon.cc
vendored
Normal file
1449
3rdparty/libyuv/source/scale_neon.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1552
3rdparty/libyuv/source/scale_neon64.cc
vendored
Normal file
1552
3rdparty/libyuv/source/scale_neon64.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
82
3rdparty/libyuv/source/scale_rgb.cc
vendored
Normal file
82
3rdparty/libyuv/source/scale_rgb.cc
vendored
Normal file
@@ -0,0 +1,82 @@
|
||||
/*
|
||||
* Copyright 2022 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/scale.h" /* For FilterMode */
|
||||
|
||||
#include <assert.h>
|
||||
#include <limits.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "libyuv/convert_argb.h"
|
||||
#include "libyuv/convert_from_argb.h"
|
||||
#include "libyuv/row.h"
|
||||
#include "libyuv/scale_argb.h"
|
||||
#include "libyuv/scale_rgb.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Scale a 24 bit image.
|
||||
// Converts to ARGB as intermediate step
|
||||
|
||||
LIBYUV_API
|
||||
int RGBScale(const uint8_t* src_rgb,
|
||||
int src_stride_rgb,
|
||||
int src_width,
|
||||
int src_height,
|
||||
uint8_t* dst_rgb,
|
||||
int dst_stride_rgb,
|
||||
int dst_width,
|
||||
int dst_height,
|
||||
enum FilterMode filtering) {
|
||||
int r;
|
||||
if (!src_rgb || !dst_rgb || src_width <= 0 || src_width > INT_MAX / 4 ||
|
||||
src_height == 0 || dst_width <= 0 || dst_width > INT_MAX / 4 ||
|
||||
dst_height <= 0) {
|
||||
return -1;
|
||||
}
|
||||
const int abs_src_height = (src_height < 0) ? -src_height : src_height;
|
||||
const uint64_t src_argb_size = (uint64_t)src_width * abs_src_height * 4;
|
||||
const uint64_t dst_argb_size = (uint64_t)dst_width * dst_height * 4;
|
||||
if (src_argb_size > (UINT64_MAX - dst_argb_size)) {
|
||||
return -1; // Invalid size.
|
||||
}
|
||||
const uint64_t argb_size = src_argb_size + dst_argb_size;
|
||||
if (argb_size > SIZE_MAX) {
|
||||
return -1; // Invalid size.
|
||||
}
|
||||
uint8_t* src_argb = (uint8_t*)malloc((size_t)argb_size);
|
||||
if (!src_argb) {
|
||||
return 1; // Out of memory runtime error.
|
||||
}
|
||||
uint8_t* dst_argb = src_argb + (size_t)src_argb_size;
|
||||
|
||||
r = RGB24ToARGB(src_rgb, src_stride_rgb, src_argb, src_width * 4, src_width,
|
||||
src_height);
|
||||
if (!r) {
|
||||
r = ARGBScale(src_argb, src_width * 4, src_width, abs_src_height, dst_argb,
|
||||
dst_width * 4, dst_width, dst_height, filtering);
|
||||
if (!r) {
|
||||
r = ARGBToRGB24(dst_argb, dst_width * 4, dst_rgb, dst_stride_rgb,
|
||||
dst_width, dst_height);
|
||||
}
|
||||
}
|
||||
free(src_argb);
|
||||
return r;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
1124
3rdparty/libyuv/source/scale_rvv.cc
vendored
Normal file
1124
3rdparty/libyuv/source/scale_rvv.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
555
3rdparty/libyuv/source/scale_sme.cc
vendored
Normal file
555
3rdparty/libyuv/source/scale_sme.cc
vendored
Normal file
@@ -0,0 +1,555 @@
|
||||
/*
|
||||
* Copyright 2024 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/scale_row.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \
|
||||
defined(__aarch64__)
|
||||
|
||||
__arm_locally_streaming void ScaleRowDown2_SME(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
// Streaming-SVE only, no use of ZA tile.
|
||||
(void)src_stride;
|
||||
int vl;
|
||||
asm volatile(
|
||||
"cntb %x[vl] \n"
|
||||
"subs %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.lt 2f \n"
|
||||
|
||||
"1: \n"
|
||||
"ptrue p0.b \n"
|
||||
"ld2b {z0.b, z1.b}, p0/z, [%[src_ptr]] \n"
|
||||
"incb %[src_ptr], all, mul #2 \n"
|
||||
"subs %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"st1b {z1.b}, p0, [%[dst_ptr]] \n"
|
||||
"incb %[dst_ptr] \n"
|
||||
"b.ge 1b \n"
|
||||
|
||||
"2: \n"
|
||||
"adds %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.eq 99f \n"
|
||||
|
||||
"whilelt p0.b, wzr, %w[dst_width] \n"
|
||||
"ld2b {z0.b, z1.b}, p0/z, [%[src_ptr]] \n"
|
||||
"st1b {z1.b}, p0, [%[dst_ptr]] \n"
|
||||
|
||||
"99: \n"
|
||||
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
|
||||
[dst_ptr] "+r"(dst), // %[dst_ptr]
|
||||
[dst_width] "+r"(dst_width), // %[dst_width]
|
||||
[vl] "=r"(vl) // %[vl]
|
||||
:
|
||||
: "memory", "cc", "z0", "z1", "p0");
|
||||
}
|
||||
|
||||
__arm_locally_streaming void ScaleRowDown2_16_SME(const uint16_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint16_t* dst,
|
||||
int dst_width) {
|
||||
// Streaming-SVE only, no use of ZA tile.
|
||||
(void)src_stride;
|
||||
int vl;
|
||||
asm volatile(
|
||||
"cnth %x[vl] \n"
|
||||
"subs %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.lt 2f \n"
|
||||
|
||||
"1: \n"
|
||||
"ptrue p0.h \n"
|
||||
"ld2h {z0.h, z1.h}, p0/z, [%[src_ptr]] \n"
|
||||
"incb %[src_ptr], all, mul #2 \n"
|
||||
"subs %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"st1h {z1.h}, p0, [%[dst_ptr]] \n"
|
||||
"incb %[dst_ptr] \n"
|
||||
"b.ge 1b \n"
|
||||
|
||||
"2: \n"
|
||||
"adds %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.eq 99f \n"
|
||||
|
||||
"whilelt p0.h, wzr, %w[dst_width] \n"
|
||||
"ld2h {z0.h, z1.h}, p0/z, [%[src_ptr]] \n"
|
||||
"st1h {z1.h}, p0, [%[dst_ptr]] \n"
|
||||
|
||||
"99: \n"
|
||||
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
|
||||
[dst_ptr] "+r"(dst), // %[dst_ptr]
|
||||
[dst_width] "+r"(dst_width), // %[dst_width]
|
||||
[vl] "=r"(vl) // %[vl]
|
||||
:
|
||||
: "memory", "cc", "z0", "z1", "p0");
|
||||
}
|
||||
|
||||
__arm_locally_streaming void ScaleRowDown2Linear_SME(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
// Streaming-SVE only, no use of ZA tile.
|
||||
(void)src_stride;
|
||||
int vl;
|
||||
asm volatile(
|
||||
"cntb %x[vl] \n"
|
||||
"subs %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.lt 2f \n"
|
||||
|
||||
"1: \n"
|
||||
"ptrue p0.b \n"
|
||||
"ld2b {z0.b, z1.b}, p0/z, [%[src_ptr]] \n"
|
||||
"incb %[src_ptr], all, mul #2 \n"
|
||||
"urhadd z0.b, p0/m, z0.b, z1.b \n"
|
||||
"subs %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"st1b {z0.b}, p0, [%[dst_ptr]] \n"
|
||||
"incb %[dst_ptr] \n"
|
||||
"b.ge 1b \n"
|
||||
|
||||
"2: \n"
|
||||
"adds %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.eq 99f \n"
|
||||
|
||||
"whilelt p0.b, wzr, %w[dst_width] \n"
|
||||
"ld2b {z0.b, z1.b}, p0/z, [%[src_ptr]] \n"
|
||||
"urhadd z0.b, p0/m, z0.b, z1.b \n"
|
||||
"st1b {z0.b}, p0, [%[dst_ptr]] \n"
|
||||
|
||||
"99: \n"
|
||||
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
|
||||
[dst_ptr] "+r"(dst), // %[dst_ptr]
|
||||
[dst_width] "+r"(dst_width), // %[dst_width]
|
||||
[vl] "=r"(vl) // %[vl]
|
||||
:
|
||||
: "memory", "cc", "z0", "z1", "p0");
|
||||
}
|
||||
|
||||
__arm_locally_streaming void ScaleRowDown2Linear_16_SME(const uint16_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint16_t* dst,
|
||||
int dst_width) {
|
||||
// Streaming-SVE only, no use of ZA tile.
|
||||
(void)src_stride;
|
||||
int vl;
|
||||
asm volatile(
|
||||
"cnth %x[vl] \n"
|
||||
"subs %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.lt 2f \n"
|
||||
|
||||
"1: \n"
|
||||
"ptrue p0.h \n"
|
||||
"ld2h {z0.h, z1.h}, p0/z, [%[src_ptr]] \n"
|
||||
"incb %[src_ptr], all, mul #2 \n"
|
||||
"urhadd z0.h, p0/m, z0.h, z1.h \n"
|
||||
"subs %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"st1h {z0.h}, p0, [%[dst_ptr]] \n"
|
||||
"incb %[dst_ptr] \n"
|
||||
"b.ge 1b \n"
|
||||
|
||||
"2: \n"
|
||||
"adds %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.eq 99f \n"
|
||||
|
||||
"whilelt p0.h, wzr, %w[dst_width] \n"
|
||||
"ld2h {z0.h, z1.h}, p0/z, [%[src_ptr]] \n"
|
||||
"urhadd z0.h, p0/m, z0.h, z1.h \n"
|
||||
"st1h {z0.h}, p0, [%[dst_ptr]] \n"
|
||||
|
||||
"99: \n"
|
||||
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
|
||||
[dst_ptr] "+r"(dst), // %[dst_ptr]
|
||||
[dst_width] "+r"(dst_width), // %[dst_width]
|
||||
[vl] "=r"(vl) // %[vl]
|
||||
:
|
||||
: "memory", "cc", "z0", "z1", "p0");
|
||||
}
|
||||
|
||||
// Asm fragment shared by the main loop and the tail of ScaleRowDown2Box_SME.
// Loads de-interleaved bytes from two rows (src_ptr and src2_ptr), sums each
// horizontally adjacent pair with widening adds (uaddlb/uaddlt), adds the two
// rows together, and narrows with a rounding shift right by 2, so each output
// byte is the rounded average of a 2x2 block. It also advances both source
// pointers and the destination pointer and subtracts vl from dst_width, which
// sets the flags tested by the caller's loop branch. p0 governs the loads and
// the store.
#define SCALEROWDOWN2BOX_SVE \
|
||||
"ld2b {z0.b, z1.b}, p0/z, [%[src_ptr]] \n" \
|
||||
"ld2b {z2.b, z3.b}, p0/z, [%[src2_ptr]] \n" \
|
||||
"incb %[src_ptr], all, mul #2 \n" \
|
||||
"incb %[src2_ptr], all, mul #2 \n" \
|
||||
"uaddlb z4.h, z0.b, z1.b \n" \
|
||||
"uaddlt z5.h, z0.b, z1.b \n" \
|
||||
"uaddlb z6.h, z2.b, z3.b \n" \
|
||||
"uaddlt z7.h, z2.b, z3.b \n" \
|
||||
"add z4.h, z4.h, z6.h \n" \
|
||||
"add z5.h, z5.h, z7.h \n" \
|
||||
"rshrnb z0.b, z4.h, #2 \n" \
|
||||
"rshrnt z0.b, z5.h, #2 \n" \
|
||||
"subs %w[dst_width], %w[dst_width], %w[vl] \n" \
|
||||
"st1b {z0.b}, p0, [%[dst_ptr]] \n" \
|
||||
"incb %[dst_ptr] \n"
|
||||
|
||||
// Halves a pair of byte rows with a 2x2 box filter.
//
// The second source row starts src_stride bytes after src_ptr. Each of the
// dst_width output bytes is the rounded average of the corresponding 2x2
// block of source bytes. Full vectors are handled in the loop at label 1; a
// final partial vector, if any, is handled after label 2 with a whilelt
// predicate limited to the remaining dst_width elements.
__arm_locally_streaming void ScaleRowDown2Box_SME(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
// Streaming-SVE only, no use of ZA tile.
|
||||
const uint8_t* src2_ptr = src_ptr + src_stride;
|
||||
int vl;
|
||||
asm volatile(
|
||||
"cntb %x[vl] \n"
|
||||
"subs %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.lt 2f \n"
|
||||
|
||||
"ptrue p0.b \n"
|
||||
"1: \n" //
|
||||
SCALEROWDOWN2BOX_SVE
|
||||
"b.ge 1b \n"
|
||||
|
||||
"2: \n"
|
||||
"adds %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.eq 99f \n"
|
||||
|
||||
"whilelt p0.b, wzr, %w[dst_width] \n" //
|
||||
SCALEROWDOWN2BOX_SVE
|
||||
|
||||
"99: \n"
|
||||
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
|
||||
[src2_ptr] "+r"(src2_ptr), // %[src2_ptr]
|
||||
[dst_ptr] "+r"(dst), // %[dst_ptr]
|
||||
[dst_width] "+r"(dst_width), // %[dst_width]
|
||||
[vl] "=r"(vl) // %[vl]
|
||||
:
|
||||
: "memory", "cc", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "p0");
|
||||
}
|
||||
|
||||
#undef SCALEROWDOWN2BOX_SVE
|
||||
|
||||
// Asm fragment shared by the main loop and the tail of
// ScaleRowDown2Box_16_SME. Same structure as the 8-bit box filter but on
// 16-bit elements: loads de-interleaved halfwords from two rows, sums each
// horizontally adjacent pair into 32-bit lanes (uaddlb/uaddlt), adds the two
// rows, and narrows with a rounding shift right by 2. It also advances the
// three pointers and subtracts vl from dst_width, which sets the flags tested
// by the caller's loop branch. p0 governs the loads and the store.
#define SCALEROWDOWN2BOX_16_SVE \
|
||||
"ld2h {z0.h, z1.h}, p0/z, [%[src_ptr]] \n" \
|
||||
"ld2h {z2.h, z3.h}, p0/z, [%[src2_ptr]] \n" \
|
||||
"incb %[src_ptr], all, mul #2 \n" \
|
||||
"incb %[src2_ptr], all, mul #2 \n" \
|
||||
"uaddlb z4.s, z0.h, z1.h \n" \
|
||||
"uaddlt z5.s, z0.h, z1.h \n" \
|
||||
"uaddlb z6.s, z2.h, z3.h \n" \
|
||||
"uaddlt z7.s, z2.h, z3.h \n" \
|
||||
"add z4.s, z4.s, z6.s \n" \
|
||||
"add z5.s, z5.s, z7.s \n" \
|
||||
"rshrnb z0.h, z4.s, #2 \n" \
|
||||
"rshrnt z0.h, z5.s, #2 \n" \
|
||||
"subs %w[dst_width], %w[dst_width], %w[vl] \n" \
|
||||
"st1h {z0.h}, p0, [%[dst_ptr]] \n" \
|
||||
"incb %[dst_ptr] \n"
|
||||
|
||||
// Halves a pair of 16-bit rows with a 2x2 box filter.
//
// The second source row starts src_stride elements after src_ptr (the
// offset is applied to a uint16_t pointer, so it is in elements, not
// bytes). Each of the dst_width output elements is the rounded average of
// the corresponding 2x2 block of source elements. Full vectors are handled
// in the loop at label 1; a final partial vector, if any, is handled after
// label 2 with a whilelt predicate limited to the remaining elements.
__arm_locally_streaming void ScaleRowDown2Box_16_SME(const uint16_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint16_t* dst,
|
||||
int dst_width) {
|
||||
// Streaming-SVE only, no use of ZA tile.
|
||||
const uint16_t* src2_ptr = src_ptr + src_stride;
|
||||
int vl;
|
||||
asm volatile(
|
||||
"cnth %x[vl] \n"
|
||||
"subs %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.lt 2f \n"
|
||||
|
||||
"ptrue p0.h \n"
|
||||
"1: \n" //
|
||||
SCALEROWDOWN2BOX_16_SVE
|
||||
"b.ge 1b \n"
|
||||
|
||||
"2: \n"
|
||||
"adds %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.eq 99f \n"
|
||||
|
||||
"whilelt p0.h, wzr, %w[dst_width] \n" //
|
||||
SCALEROWDOWN2BOX_16_SVE
|
||||
|
||||
"99: \n"
|
||||
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
|
||||
[src2_ptr] "+r"(src2_ptr), // %[src2_ptr]
|
||||
[dst_ptr] "+r"(dst), // %[dst_ptr]
|
||||
[dst_width] "+r"(dst_width), // %[dst_width]
|
||||
[vl] "=r"(vl) // %[vl]
|
||||
:
|
||||
: "memory", "cc", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "p0");
|
||||
}
|
||||
|
||||
#undef SCALEROWDOWN2BOX_16_SVE
|
||||
|
||||
__arm_locally_streaming void ScaleUVRowDown2_SME(const uint8_t* src_uv,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_uv,
|
||||
int dst_width) {
|
||||
// Streaming-SVE only, no use of ZA tile.
|
||||
(void)src_stride;
|
||||
int vl;
|
||||
asm volatile(
|
||||
"cnth %x[vl] \n"
|
||||
"subs %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.lt 2f \n"
|
||||
|
||||
"1: \n"
|
||||
"ptrue p0.b \n"
|
||||
"ld2h {z0.h, z1.h}, p0/z, [%[src_uv]] \n"
|
||||
"incb %[src_uv], all, mul #2 \n"
|
||||
"subs %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"st1h {z1.h}, p0, [%[dst_uv]] \n"
|
||||
"incb %[dst_uv] \n"
|
||||
"b.ge 1b \n"
|
||||
|
||||
"2: \n"
|
||||
"adds %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.eq 99f \n"
|
||||
|
||||
"whilelt p0.h, wzr, %w[dst_width] \n"
|
||||
"ld2h {z0.h, z1.h}, p0/z, [%[src_uv]] \n"
|
||||
"st1h {z1.h}, p0, [%[dst_uv]] \n"
|
||||
|
||||
"99: \n"
|
||||
: [src_uv] "+r"(src_uv), // %[src_uv]
|
||||
[dst_uv] "+r"(dst_uv), // %[dst_uv]
|
||||
[dst_width] "+r"(dst_width), // %[dst_width]
|
||||
[vl] "=r"(vl) // %[vl]
|
||||
:
|
||||
: "memory", "cc", "z0", "z1", "p0");
|
||||
}
|
||||
|
||||
__arm_locally_streaming void ScaleUVRowDown2Linear_SME(const uint8_t* src_uv,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_uv,
|
||||
int dst_width) {
|
||||
// Streaming-SVE only, no use of ZA tile.
|
||||
(void)src_stride;
|
||||
int vl;
|
||||
asm volatile(
|
||||
"cnth %x[vl] \n"
|
||||
"ptrue p1.b \n"
|
||||
"subs %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.lt 2f \n"
|
||||
|
||||
"ptrue p0.h \n"
|
||||
"1: \n"
|
||||
"ld2h {z0.h, z1.h}, p0/z, [%[src_uv]] \n"
|
||||
"incb %[src_uv], all, mul #2 \n"
|
||||
"urhadd z0.b, p1/m, z0.b, z1.b \n"
|
||||
"st1h {z0.h}, p0, [%[dst_uv]] \n"
|
||||
"incb %[dst_uv], all, mul #1 \n"
|
||||
"subs %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.ge 1b \n"
|
||||
|
||||
"2: \n"
|
||||
"adds %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.eq 99f \n"
|
||||
|
||||
"whilelt p0.h, wzr, %w[dst_width] \n"
|
||||
"ld2h {z0.h, z1.h}, p0/z, [%[src_uv]] \n"
|
||||
"urhadd z0.b, p1/m, z0.b, z1.b \n"
|
||||
"st1h {z0.h}, p0, [%[dst_uv]] \n"
|
||||
|
||||
"99: \n"
|
||||
: [src_uv] "+r"(src_uv), // %[src_uv]
|
||||
[dst_uv] "+r"(dst_uv), // %[dst_uv]
|
||||
[dst_width] "+r"(dst_width), // %[dst_width]
|
||||
[vl] "=r"(vl) // %[vl]
|
||||
:
|
||||
: "z0", "z1", "p0", "p1");
|
||||
}
|
||||
|
||||
#define SCALEUVROWDOWN2BOX_SVE \
|
||||
"ld2h {z0.h, z1.h}, p0/z, [%[src_uv]] \n" \
|
||||
"ld2h {z2.h, z3.h}, p0/z, [%[src2_uv]] \n" \
|
||||
"incb %[src_uv], all, mul #2 \n" \
|
||||
"incb %[src2_uv], all, mul #2 \n" \
|
||||
"uaddlb z4.h, z0.b, z1.b \n" \
|
||||
"uaddlt z5.h, z0.b, z1.b \n" \
|
||||
"uaddlb z6.h, z2.b, z3.b \n" \
|
||||
"uaddlt z7.h, z2.b, z3.b \n" \
|
||||
"add z4.h, z4.h, z6.h \n" \
|
||||
"add z5.h, z5.h, z7.h \n" \
|
||||
"rshrnb z0.b, z4.h, #2 \n" \
|
||||
"rshrnt z0.b, z5.h, #2 \n" \
|
||||
"st1h {z0.h}, p0, [%[dst_uv]] \n" \
|
||||
"incb %[dst_uv], all, mul #1 \n"
|
||||
|
||||
__arm_locally_streaming void ScaleUVRowDown2Box_SME(const uint8_t* src_uv,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_uv,
|
||||
int dst_width) {
|
||||
// Streaming-SVE only, no use of ZA tile.
|
||||
const uint8_t* src2_uv = src_uv + src_stride;
|
||||
int vl;
|
||||
asm volatile(
|
||||
"cnth %x[vl] \n"
|
||||
"ptrue p1.b \n"
|
||||
"subs %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.lt 2f \n"
|
||||
|
||||
"ptrue p0.h \n"
|
||||
"1: \n" //
|
||||
SCALEUVROWDOWN2BOX_SVE
|
||||
"subs %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.ge 1b \n"
|
||||
|
||||
"2: \n"
|
||||
"adds %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.eq 99f \n"
|
||||
|
||||
"whilelt p0.h, wzr, %w[dst_width] \n" //
|
||||
SCALEUVROWDOWN2BOX_SVE
|
||||
|
||||
"99: \n"
|
||||
: [src_uv] "+r"(src_uv), // %[src_uv]
|
||||
[src2_uv] "+r"(src2_uv), // %[src2_uv]
|
||||
[dst_uv] "+r"(dst_uv), // %[dst_uv]
|
||||
[dst_width] "+r"(dst_width), // %[dst_width]
|
||||
[vl] "=r"(vl) // %[vl]
|
||||
:
|
||||
: "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "p0", "p1");
|
||||
}
|
||||
|
||||
#undef SCALEUVROWDOWN2BOX_SVE
|
||||
|
||||
__arm_locally_streaming void ScaleARGBRowDown2_SME(const uint8_t* src_argb,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_argb,
|
||||
int dst_width) {
|
||||
// Streaming-SVE only, no use of ZA tile.
|
||||
(void)src_stride;
|
||||
int vl;
|
||||
asm volatile(
|
||||
"cntw %x[vl] \n"
|
||||
"subs %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.lt 2f \n"
|
||||
|
||||
"1: \n"
|
||||
"ptrue p0.b \n"
|
||||
"ld2w {z0.s, z1.s}, p0/z, [%[src_argb]] \n"
|
||||
"incb %[src_argb], all, mul #2 \n"
|
||||
"subs %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"st1w {z1.s}, p0, [%[dst_argb]] \n"
|
||||
"incb %[dst_argb] \n"
|
||||
"b.ge 1b \n"
|
||||
|
||||
"2: \n"
|
||||
"adds %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.eq 99f \n"
|
||||
|
||||
"whilelt p0.s, wzr, %w[dst_width] \n"
|
||||
"ld2w {z0.s, z1.s}, p0/z, [%[src_argb]] \n"
|
||||
"st1w {z1.s}, p0, [%[dst_argb]] \n"
|
||||
|
||||
"99: \n"
|
||||
: [src_argb] "+r"(src_argb), // %[src_argb]
|
||||
[dst_argb] "+r"(dst_argb), // %[dst_argb]
|
||||
[dst_width] "+r"(dst_width), // %[dst_width]
|
||||
[vl] "=r"(vl) // %[vl]
|
||||
:
|
||||
: "memory", "cc", "z0", "z1", "p0");
|
||||
}
|
||||
|
||||
// Halves a row of 4-byte (ARGB) pixels with a horizontal linear filter.
//
// dst_width counts 4-byte pixels. ld2w de-interleaves adjacent pixels into
// z0 (even-indexed) and z1 (odd-indexed); urhadd then averages them
// byte-by-byte with rounding, so each channel byte is filtered
// independently. p1 is an all-true byte predicate used only by urhadd; p0
// governs the 32-bit loads and stores and is replaced by a whilelt predicate
// for the final partial vector. src_stride is ignored because only one
// source row is read.
__arm_locally_streaming void ScaleARGBRowDown2Linear_SME(
|
||||
const uint8_t* src_argb,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_argb,
|
||||
int dst_width) {
|
||||
// Streaming-SVE only, no use of ZA tile.
|
||||
(void)src_stride;
|
||||
int vl;
|
||||
asm volatile(
|
||||
"cntw %x[vl] \n"
|
||||
"ptrue p1.b \n"
|
||||
"subs %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.lt 2f \n"
|
||||
|
||||
"ptrue p0.s \n"
|
||||
"1: \n"
|
||||
"ld2w {z0.s, z1.s}, p0/z, [%[src_argb]] \n"
|
||||
"incb %[src_argb], all, mul #2 \n"
|
||||
"urhadd z0.b, p1/m, z0.b, z1.b \n"
|
||||
"st1w {z0.s}, p0, [%[dst_argb]] \n"
|
||||
"incb %[dst_argb], all, mul #1 \n"
|
||||
"subs %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.ge 1b \n"
|
||||
|
||||
"2: \n"
|
||||
"adds %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.eq 99f \n"
|
||||
|
||||
"whilelt p0.s, wzr, %w[dst_width] \n"
|
||||
"ld2w {z0.s, z1.s}, p0/z, [%[src_argb]] \n"
|
||||
"urhadd z0.b, p1/m, z0.b, z1.b \n"
|
||||
"st1w {z0.s}, p0, [%[dst_argb]] \n"
|
||||
|
||||
"99: \n"
|
||||
: [src_argb] "+r"(src_argb), // %[src_argb]
|
||||
[dst_argb] "+r"(dst_argb), // %[dst_argb]
|
||||
[dst_width] "+r"(dst_width), // %[dst_width]
|
||||
[vl] "=r"(vl) // %[vl]
|
||||
:
|
||||
: "memory", "cc", "z0", "z1", "p0", "p1");
|
||||
}
|
||||
|
||||
#define SCALEARGBROWDOWN2BOX_SVE \
|
||||
"ld2w {z0.s, z1.s}, p0/z, [%[src_argb]] \n" \
|
||||
"ld2w {z2.s, z3.s}, p0/z, [%[src2_argb]] \n" \
|
||||
"incb %[src_argb], all, mul #2 \n" \
|
||||
"incb %[src2_argb], all, mul #2 \n" \
|
||||
"uaddlb z4.h, z0.b, z1.b \n" \
|
||||
"uaddlt z5.h, z0.b, z1.b \n" \
|
||||
"uaddlb z6.h, z2.b, z3.b \n" \
|
||||
"uaddlt z7.h, z2.b, z3.b \n" \
|
||||
"add z4.h, z4.h, z6.h \n" \
|
||||
"add z5.h, z5.h, z7.h \n" \
|
||||
"rshrnb z0.b, z4.h, #2 \n" \
|
||||
"rshrnt z0.b, z5.h, #2 \n" \
|
||||
"st1w {z0.s}, p0, [%[dst_argb]] \n" \
|
||||
"incb %[dst_argb], all, mul #1 \n"
|
||||
|
||||
__arm_locally_streaming void ScaleARGBRowDown2Box_SME(const uint8_t* src_argb,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_argb,
|
||||
int dst_width) {
|
||||
// Streaming-SVE only, no use of ZA tile.
|
||||
const uint8_t* src2_argb = src_argb + src_stride;
|
||||
int vl;
|
||||
asm volatile(
|
||||
"cntw %x[vl] \n"
|
||||
"ptrue p1.b \n"
|
||||
"subs %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.lt 2f \n"
|
||||
|
||||
"ptrue p0.s \n"
|
||||
"1: \n" //
|
||||
SCALEARGBROWDOWN2BOX_SVE
|
||||
"subs %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.ge 1b \n"
|
||||
|
||||
"2: \n"
|
||||
"adds %w[dst_width], %w[dst_width], %w[vl] \n"
|
||||
"b.eq 99f \n"
|
||||
|
||||
"whilelt p0.s, wzr, %w[dst_width] \n" //
|
||||
SCALEARGBROWDOWN2BOX_SVE
|
||||
|
||||
"99: \n"
|
||||
: [src_argb] "+r"(src_argb), // %[src_argb]
|
||||
[src2_argb] "+r"(src2_argb), // %[src2_argb]
|
||||
[dst_argb] "+r"(dst_argb), // %[dst_argb]
|
||||
[dst_width] "+r"(dst_width), // %[dst_width]
|
||||
[vl] "=r"(vl) // %[vl]
|
||||
:
|
||||
: "memory", "cc", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "p0",
|
||||
"p1");
|
||||
}
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) &&
|
||||
// defined(__aarch64__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
1159
3rdparty/libyuv/source/scale_uv.cc
vendored
Normal file
1159
3rdparty/libyuv/source/scale_uv.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1392
3rdparty/libyuv/source/scale_win.cc
vendored
Normal file
1392
3rdparty/libyuv/source/scale_win.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
35
3rdparty/libyuv/source/test.sh
vendored
Normal file
35
3rdparty/libyuv/source/test.sh
vendored
Normal file
@@ -0,0 +1,35 @@
|
||||
#!/bin/bash
|
||||
set -x
|
||||
|
||||
# Profiles the libyuv_test cases whose name ends in $1 under `perf record`
# (1280x720 frames, 1000 repeats), then prints the `perf report` lines that
# mention AVX.
function runbenchmark1 {
  # The filter is quoted so the literal '*' always reaches the test binary
  # and is never expanded as a shell glob or split on whitespace.
  perf record /google/src/cloud/fbarchard/clean/google3/blaze-bin/third_party/libyuv/libyuv_test "--gunit_filter=*$1" --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1
  perf report | grep AVX
}
|
||||
|
||||
# Profile each benchmark in turn. The argument is the test-name suffix that
# runbenchmark1 appends to the "*" in its --gunit_filter.
runbenchmark1 ABGRToI420
|
||||
runbenchmark1 Android420ToI420
|
||||
runbenchmark1 ARGBToI420
|
||||
runbenchmark1 Convert16To8Plane
|
||||
runbenchmark1 ConvertToARGB
|
||||
runbenchmark1 ConvertToI420
|
||||
runbenchmark1 CopyPlane
|
||||
runbenchmark1 H010ToAB30
|
||||
runbenchmark1 H010ToAR30
|
||||
runbenchmark1 HalfFloatPlane
|
||||
runbenchmark1 I010ToAB30
|
||||
runbenchmark1 I010ToAR30
|
||||
runbenchmark1 I420Copy
|
||||
runbenchmark1 I420Psnr
|
||||
runbenchmark1 I420Scale
|
||||
runbenchmark1 I420Ssim
|
||||
runbenchmark1 I420ToARGB
|
||||
runbenchmark1 I420ToNV12
|
||||
runbenchmark1 I420ToUYVY
|
||||
runbenchmark1 I422ToI420
|
||||
runbenchmark1 InitCpuFlags
|
||||
runbenchmark1 J420ToARGB
|
||||
runbenchmark1 NV12ToARGB
|
||||
runbenchmark1 NV12ToI420
|
||||
runbenchmark1 NV12ToI420Rotate
|
||||
runbenchmark1 SetCpuFlags
|
||||
runbenchmark1 YUY2ToI420
|
||||
62
3rdparty/libyuv/source/video_common.cc
vendored
Normal file
62
3rdparty/libyuv/source/video_common.cc
vendored
Normal file
@@ -0,0 +1,62 @@
|
||||
/*
|
||||
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/video_common.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// One row of the FourCC alias table: maps an alternative FourCC code
// (alias) to the code it should be treated as (canonical).
struct FourCCAliasEntry {
|
||||
uint32_t alias;
|
||||
uint32_t canonical;
|
||||
};
|
||||
|
||||
// Number of entries in kFourCCAliases; must be kept in sync with the
// initializer list below.
#define NUM_ALIASES 18
|
||||
// Alias -> canonical FourCC pairs searched linearly by CanonicalFourCC().
static const struct FourCCAliasEntry kFourCCAliases[NUM_ALIASES] = {
|
||||
{FOURCC_IYUV, FOURCC_I420},
|
||||
{FOURCC_YU12, FOURCC_I420},
|
||||
{FOURCC_YU16, FOURCC_I422},
|
||||
{FOURCC_YU24, FOURCC_I444},
|
||||
{FOURCC_YUYV, FOURCC_YUY2},
|
||||
{FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs
|
||||
{FOURCC_HDYC, FOURCC_UYVY},
|
||||
{FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8
|
||||
{FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not.
|
||||
{FOURCC_DMB1, FOURCC_MJPG},
|
||||
{FOURCC_BA81, FOURCC_BGGR}, // deprecated.
|
||||
{FOURCC_RGB3, FOURCC_RAW},
|
||||
{FOURCC_BGR3, FOURCC_24BG},
|
||||
{FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB
|
||||
{FOURCC_CM24, FOURCC_RAW}, // kCMPixelFormat_24RGB
|
||||
{FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555
|
||||
{FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565
|
||||
{FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551
|
||||
};
|
||||
// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB.
|
||||
// {FOURCC_BGRA, FOURCC_ARGB}, // kCMPixelFormat_32BGRA
|
||||
|
||||
LIBYUV_API
|
||||
uint32_t CanonicalFourCC(uint32_t fourcc) {
|
||||
int i;
|
||||
for (i = 0; i < NUM_ALIASES; ++i) {
|
||||
if (kFourCCAliases[i].alias == fourcc) {
|
||||
return kFourCCAliases[i].canonical;
|
||||
}
|
||||
}
|
||||
// Not an alias, so return it as-is.
|
||||
return fourcc;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
Reference in New Issue
Block a user