From 5878cbc47eee6d4cc1d1281b57613a9984ddf6a6 Mon Sep 17 00:00:00 2001 From: ttwu Date: Wed, 31 Jan 2024 10:45:50 +0800 Subject: [PATCH] added IDCT rvv code from Andes --- simd/rvv_andes/jidctfst-rvv.c | 377 ++++++++++++++++++++++++++ simd/rvv_andes/jidctint-rvv.c | 489 ++++++++++++++++++++++++++++++++++ simd/rvv_andes/jidctred-rvv.c | 436 ++++++++++++++++++++++++++++++ 3 files changed, 1302 insertions(+) create mode 100644 simd/rvv_andes/jidctfst-rvv.c create mode 100644 simd/rvv_andes/jidctint-rvv.c create mode 100644 simd/rvv_andes/jidctred-rvv.c diff --git a/simd/rvv_andes/jidctfst-rvv.c b/simd/rvv_andes/jidctfst-rvv.c new file mode 100644 index 000000000..3ee5043f6 --- /dev/null +++ b/simd/rvv_andes/jidctfst-rvv.c @@ -0,0 +1,377 @@ +/* +* jidctfst-rvv.c - fast integer IDCT (RISC-V RVV) +* +* Copyright (c) 2012-2024 Andes Technology Corporation +* All rights reserved. +*/ +/* + * jidctfst-neon.c - fast integer IDCT (Arm Neon) + * + * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" + +#include + + +/* jsimd_idct_ifast_rvv() performs dequantization and a fast, not so accurate + * inverse DCT (Discrete Cosine Transform) on one block of coefficients. It + * uses the same calculations and produces exactly the same output as IJG's + * original jpeg_idct_ifast() function, which can be found in jidctfst.c. + * + * Scaled integer constants are used to avoid floating-point arithmetic: + * 0.082392200 = 2688 * 2^-15 + * 0.414213562 = 13568 * 2^-15 + * 0.847759065 = 27776 * 2^-15 + * 0.613125930 = 20096 * 2^-15 + * + * See jidctfst.c for further details of the IDCT algorithm. Where possible, + * the variable names and comments here in jsimd_idct_ifast_rvv() match up + * with those in jpeg_idct_ifast(). + */ + +#define PASS1_BITS 2 + +#define F_0_082 2688 +#define F_0_414 13568 +#define F_0_847 27776 +#define F_0_613 20096 + + +static const int16_t idct_ifast_consts[] = { + F_0_082, F_0_414, F_0_847, F_0_613 +}; + +void jsimd_idct_ifast_rvv(void *dct_table, JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + IFAST_MULT_TYPE *quantptr = dct_table; + vint16m8_t rows_all_i16m8; + + /* Load DCT coefficients. */ + size_t vl = 8; + vint16m1_t row0 = __riscv_vle16_v_i16m1(coef_block + 0 * DCTSIZE, vl); + vint16m1_t row1 = __riscv_vle16_v_i16m1(coef_block + 1 * DCTSIZE, vl); + vint16m1_t row2 = __riscv_vle16_v_i16m1(coef_block + 2 * DCTSIZE, vl); + vint16m1_t row3 = __riscv_vle16_v_i16m1(coef_block + 3 * DCTSIZE, vl); + vint16m1_t row4 = __riscv_vle16_v_i16m1(coef_block + 4 * DCTSIZE, vl); + vint16m1_t row5 = __riscv_vle16_v_i16m1(coef_block + 5 * DCTSIZE, vl); + vint16m1_t row6 = __riscv_vle16_v_i16m1(coef_block + 6 * DCTSIZE, vl); + vint16m1_t row7 = __riscv_vle16_v_i16m1(coef_block + 7 * DCTSIZE, vl); + + /* Load quantization table values for DC coefficients. */ + vint16m1_t quant_row0 = __riscv_vle16_v_i16m1(quantptr + 0 * DCTSIZE, vl); + + /* Dequantize DC coefficients. */ + row0 = __riscv_vmul_vv_i16m1(row0, quant_row0, vl); + + /* Construct bitmap to test if all AC coefficients are 0. */ + vint16m1_t bitmap = __riscv_vor_vv_i16m1(row1, row2, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row3, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row4, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row5, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row6, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row7, vl); + + uint16_t ac_bitmap; + { + vuint16m1_t vec_zero = __riscv_vmv_s_x_u16m1(0, vl); + vuint16m1_t tmp_u16m1 = __riscv_vreinterpret_v_i16m1_u16m1(bitmap); + ac_bitmap = __riscv_vmv_x_s_u16m1_u16(__riscv_vredor_vs_u16m1_u16m1(tmp_u16m1, vec_zero, vl)); + } + + /* Load IDCT conversion constants. */ + if (0 == ac_bitmap) + { + /* All AC coefficients are zero. + * Compute DC values and duplicate into vectors. + */ + // combine vectors + vint16m4_t tmp_i16m4 = __riscv_vlmul_ext_v_i16m1_i16m4(row0); + tmp_i16m4 = __riscv_vslideup_vx_i16m4(tmp_i16m4, tmp_i16m4, DCTSIZE2/8, DCTSIZE2/4); + tmp_i16m4 = __riscv_vslideup_vx_i16m4(tmp_i16m4, tmp_i16m4, DCTSIZE2/4, DCTSIZE2/2); + rows_all_i16m8 = __riscv_vlmul_ext_v_i16m4_i16m8(tmp_i16m4); + rows_all_i16m8 = __riscv_vslideup_vx_i16m8(rows_all_i16m8, rows_all_i16m8, DCTSIZE2/2, DCTSIZE2); + } + else { + /* full IDCT calculation. */ + + /* Load quantization table. */ + vint16m1_t quant_row1 = __riscv_vle16_v_i16m1(quantptr + 1 * DCTSIZE, vl); + vint16m1_t quant_row2 = __riscv_vle16_v_i16m1(quantptr + 2 * DCTSIZE, vl); + vint16m1_t quant_row3 = __riscv_vle16_v_i16m1(quantptr + 3 * DCTSIZE, vl); + vint16m1_t quant_row4 = __riscv_vle16_v_i16m1(quantptr + 4 * DCTSIZE, vl); + vint16m1_t quant_row5 = __riscv_vle16_v_i16m1(quantptr + 5 * DCTSIZE, vl); + vint16m1_t quant_row6 = __riscv_vle16_v_i16m1(quantptr + 6 * DCTSIZE, vl); + vint16m1_t quant_row7 = __riscv_vle16_v_i16m1(quantptr + 7 * DCTSIZE, vl); + + /* Even part: dequantize DCT coefficients. */ + vint16m1_t tmp0 = __riscv_vmv_v_v_i16m1(row0, vl); + vint16m1_t tmp1 = __riscv_vmul_vv_i16m1(row2, quant_row2, vl); + vint16m1_t tmp2 = __riscv_vmul_vv_i16m1(row4, quant_row4, vl); + vint16m1_t tmp3 = __riscv_vmul_vv_i16m1(row6, quant_row6, vl); + + vint16m1_t tmp10 = __riscv_vadd_vv_i16m1(tmp0, tmp2, vl); /* phase 3 */ + vint16m1_t tmp11 = __riscv_vsub_vv_i16m1(tmp0, tmp2, vl); + vint16m1_t tmp13 = __riscv_vadd_vv_i16m1(tmp1, tmp3, vl); /* phases 5-3 */ + vint16m1_t tmp12; + { + vint16m1_t tmp1_sub_tmp3 = __riscv_vsub_vv_i16m1(tmp1, tmp3, vl); + tmp12 = __riscv_vsmul_vx_i16m1(tmp1_sub_tmp3, idct_ifast_consts[1], vl); + tmp12 = __riscv_vadd_vv_i16m1(tmp12, tmp1_sub_tmp3, vl); + tmp12 = __riscv_vsub_vv_i16m1(tmp12, tmp13, vl); + } + + tmp0 = __riscv_vadd_vv_i16m1(tmp10, tmp13, vl); /* phase 2 */ + tmp3 = __riscv_vsub_vv_i16m1(tmp10, tmp13, vl); + tmp1 = __riscv_vadd_vv_i16m1(tmp11, tmp12, vl); + tmp2 = __riscv_vsub_vv_i16m1(tmp11, tmp12, vl); + + /* Odd part: dequantize DCT coefficients. */ + vint16m1_t tmp4 = __riscv_vmul_vv_i16m1(__riscv_vmv_v_v_i16m1(row1, vl), quant_row1, vl); + vint16m1_t tmp5 = __riscv_vmul_vv_i16m1(__riscv_vmv_v_v_i16m1(row3, vl), quant_row3, vl); + vint16m1_t tmp6 = __riscv_vmul_vv_i16m1(__riscv_vmv_v_v_i16m1(row5, vl), quant_row5, vl); + vint16m1_t tmp7 = __riscv_vmul_vv_i16m1(__riscv_vmv_v_v_i16m1(row7, vl), quant_row7, vl); + + vint16m1_t z13 = __riscv_vadd_vv_i16m1(tmp5, tmp6, vl); /* phase 6 */ + vint16m1_t neg_z10 = __riscv_vsub_vv_i16m1(tmp5, tmp6, vl); + vint16m1_t z11 = __riscv_vadd_vv_i16m1(tmp4, tmp7, vl); + vint16m1_t z12 = __riscv_vsub_vv_i16m1(tmp4, tmp7, vl); + + tmp7 = __riscv_vadd_vv_i16m1(z11, z13, vl); /* phase 5 */ + { + vint16m1_t z11_sub_z13 = __riscv_vsub_vv_i16m1(z11, z13, vl); + tmp11 = __riscv_vsmul_vx_i16m1(z11_sub_z13, idct_ifast_consts[1], vl); + tmp11 = __riscv_vadd_vv_i16m1(tmp11, z11_sub_z13, vl); + } + + { + vint16m1_t z10_add_z12 = __riscv_vsub_vv_i16m1(z12, neg_z10, vl); + vint16m1_t z5 = __riscv_vsmul_vx_i16m1(z10_add_z12, idct_ifast_consts[2], vl); + z5 = __riscv_vadd_vv_i16m1(z5, z10_add_z12, vl); + + tmp10 = __riscv_vsmul_vx_i16m1(z12, idct_ifast_consts[0], vl); + tmp10 = __riscv_vadd_vv_i16m1(tmp10, z12, vl); + tmp10 = __riscv_vsub_vv_i16m1(tmp10, z5, vl); + + tmp12 = __riscv_vsmul_vx_i16m1(neg_z10, idct_ifast_consts[3], vl); + vint16m1_t tmp_neg_z10_d = __riscv_vadd_vv_i16m1(neg_z10, neg_z10, vl); + tmp12 = __riscv_vadd_vv_i16m1(tmp12, tmp_neg_z10_d, vl); + tmp12 = __riscv_vadd_vv_i16m1(tmp12, z5, vl); + } + + tmp6 = __riscv_vsub_vv_i16m1(tmp12, tmp7, vl); /* phase 2 */ + tmp5 = __riscv_vsub_vv_i16m1(tmp11, tmp6, vl); + tmp4 = __riscv_vadd_vv_i16m1(tmp10, tmp5, vl); + + row0 = __riscv_vadd_vv_i16m1(tmp0, tmp7, vl); + row7 = __riscv_vsub_vv_i16m1(tmp0, tmp7, vl); + row1 = __riscv_vadd_vv_i16m1(tmp1, tmp6, vl); + row6 = __riscv_vsub_vv_i16m1(tmp1, tmp6, vl); + row2 = __riscv_vadd_vv_i16m1(tmp2, tmp5, vl); + row5 = __riscv_vsub_vv_i16m1(tmp2, tmp5, vl); + row4 = __riscv_vadd_vv_i16m1(tmp3, tmp4, vl); + row3 = __riscv_vsub_vv_i16m1(tmp3, tmp4, vl); + + // combine vectors + vint16m4_t rows_0123_i16m4 = __riscv_vlmul_ext_v_i16m1_i16m4(row0); + rows_0123_i16m4 = __riscv_vslideup_vx_i16m4(rows_0123_i16m4, __riscv_vlmul_ext_v_i16m1_i16m4(row1), 8, 32); + rows_0123_i16m4 = __riscv_vslideup_vx_i16m4(rows_0123_i16m4, __riscv_vlmul_ext_v_i16m1_i16m4(row2), 16, 32); + rows_0123_i16m4 = __riscv_vslideup_vx_i16m4(rows_0123_i16m4, __riscv_vlmul_ext_v_i16m1_i16m4(row2), 24, 32); + + vint16m4_t rows_4567_i16m4 = __riscv_vlmul_ext_v_i16m1_i16m4(row4); + rows_4567_i16m4 = __riscv_vslideup_vx_i16m4(rows_4567_i16m4, __riscv_vlmul_ext_v_i16m1_i16m4(row5), 8, 32); + rows_4567_i16m4 = __riscv_vslideup_vx_i16m4(rows_4567_i16m4, __riscv_vlmul_ext_v_i16m1_i16m4(row6), 16, 32); + rows_4567_i16m4 = __riscv_vslideup_vx_i16m4(rows_4567_i16m4, __riscv_vlmul_ext_v_i16m1_i16m4(row7), 24, 32); + + rows_all_i16m8 = __riscv_vlmul_ext_v_i16m4_i16m8(rows_0123_i16m4); + rows_all_i16m8 = __riscv_vslideup_vx_i16m8(rows_all_i16m8, __riscv_vlmul_ext_v_i16m4_i16m8(rows_4567_i16m4), 32, 64); + } + + /* Transpose rows to work on columns in pass 2. */ + const uint8_t trans_index8x8_u8[DCTSIZE2] = + { + 0, 8, 16, 24, 32, 40, 48, 56, + 1, 9, 17, 25, 33, 41, 49, 57, + 2, 10, 18, 26, 34, 42, 50, 58, + 3, 11, 19, 27, 35, 43, 51, 59, + 4, 12, 20, 28, 36, 44, 52, 60, + 5, 13, 21, 29, 37, 45, 53, 61, + 6, 14, 22, 30, 38, 46, 54, 62, + 7, 15, 23, 31, 39, 47, 55, 63, + }; + + // load transpose look-up table + vuint8m4_t vg_reg8 = __riscv_vle8_v_u8m4(trans_index8x8_u8, DCTSIZE2); + + // interpret to u16 & transpose + vint16m8_t vg_reg16 = __riscv_vrgather(rows_all_i16m8, __riscv_vzext_vf2_u16m8(vg_reg8, DCTSIZE2), \ + DCTSIZE2); + int16_t workspace[DCTSIZE2]; /* buffers data between passes */ + __riscv_vse16_v_i16m8(&workspace[0], vg_reg16, DCTSIZE2); + vint16m1_t col0 = __riscv_vle16_v_i16m1(&workspace[8*0], vl); + vint16m1_t col1 = __riscv_vle16_v_i16m1(&workspace[8*1], vl); + vint16m1_t col2 = __riscv_vle16_v_i16m1(&workspace[8*2], vl); + vint16m1_t col3 = __riscv_vle16_v_i16m1(&workspace[8*3], vl); + vint16m1_t col4 = __riscv_vle16_v_i16m1(&workspace[8*4], vl); + vint16m1_t col5 = __riscv_vle16_v_i16m1(&workspace[8*5], vl); + vint16m1_t col6 = __riscv_vle16_v_i16m1(&workspace[8*6], vl); + vint16m1_t col7 = __riscv_vle16_v_i16m1(&workspace[8*7], vl); + + /* 1-D IDCT, pass 2 */ + + /* Even part */ + vint16m1_t tmp10 = __riscv_vadd_vv_i16m1(col0, col4, vl); + vint16m1_t tmp11 = __riscv_vsub_vv_i16m1(col0, col4, vl); + vint16m1_t tmp13 = __riscv_vadd_vv_i16m1(col2, col6, vl); + vint16m1_t tmp12; + { + vint16m1_t col2_sub_col6 = __riscv_vsub_vv_i16m1(col2, col6, vl); + tmp12 = __riscv_vsmul_vx_i16m1(col2_sub_col6, idct_ifast_consts[1], vl); + tmp12 = __riscv_vadd_vv_i16m1(tmp12, col2_sub_col6, vl); + tmp12 = __riscv_vsub_vv_i16m1(tmp12, tmp13, vl); + } + + vint16m1_t tmp0 = __riscv_vadd_vv_i16m1(tmp10, tmp13, vl); + vint16m1_t tmp3 = __riscv_vsub_vv_i16m1(tmp10, tmp13, vl); + vint16m1_t tmp1 = __riscv_vadd_vv_i16m1(tmp11, tmp12, vl); + vint16m1_t tmp2 = __riscv_vsub_vv_i16m1(tmp11, tmp12, vl); + + /* Odd part */ + vint16m1_t z13 = __riscv_vadd_vv_i16m1(col5, col3, vl); + vint16m1_t neg_z10 = __riscv_vsub_vv_i16m1(col3, col5, vl); + vint16m1_t z11 = __riscv_vadd_vv_i16m1(col1, col7, vl); + vint16m1_t z12 = __riscv_vsub_vv_i16m1(col1, col7, vl); + + vint16m1_t tmp7 = __riscv_vadd_vv_i16m1(z11, z13, vl); /* phase 5 */ + { + vint16m1_t z11_sub_z13 = __riscv_vsub_vv_i16m1(z11, z13, vl); + tmp11 = __riscv_vsmul_vx_i16m1(z11_sub_z13, idct_ifast_consts[1], vl); + tmp11 = __riscv_vadd_vv_i16m1(tmp11, z11_sub_z13, vl); + } + + { + vint16m1_t z10_add_z12 = __riscv_vsub_vv_i16m1(z12, neg_z10, vl); + vint16m1_t z5 = __riscv_vsmul_vx_i16m1(z10_add_z12, idct_ifast_consts[2], vl); + z5 = __riscv_vadd_vv_i16m1(z5, z10_add_z12, vl); + + tmp10 = __riscv_vsmul_vx_i16m1(z12, idct_ifast_consts[0], vl); + tmp10 = __riscv_vadd_vv_i16m1(tmp10, z12, vl); + tmp10 = __riscv_vsub_vv_i16m1(tmp10, z5, vl); + + tmp12 = __riscv_vsmul_vx_i16m1(neg_z10, idct_ifast_consts[3], vl); + vint16m1_t tmp_neg_z10_d = __riscv_vadd_vv_i16m1(neg_z10, neg_z10, vl); + tmp12 = __riscv_vadd_vv_i16m1(tmp12, tmp_neg_z10_d, vl); + tmp12 = __riscv_vadd_vv_i16m1(tmp12, z5, vl); + } + + vint16m1_t tmp6 = __riscv_vsub_vv_i16m1(tmp12, tmp7, vl); /* phase 2 */ + vint16m1_t tmp5 = __riscv_vsub_vv_i16m1(tmp11, tmp6, vl); + vint16m1_t tmp4 = __riscv_vadd_vv_i16m1(tmp10, tmp5, vl); + + col0 = __riscv_vadd_vv_i16m1(tmp0, tmp7, vl); + col7 = __riscv_vsub_vv_i16m1(tmp0, tmp7, vl); + col1 = __riscv_vadd_vv_i16m1(tmp1, tmp6, vl); + col6 = __riscv_vsub_vv_i16m1(tmp1, tmp6, vl); + col2 = __riscv_vadd_vv_i16m1(tmp2, tmp5, vl); + col5 = __riscv_vsub_vv_i16m1(tmp2, tmp5, vl); + col4 = __riscv_vadd_vv_i16m1(tmp3, tmp4, vl); + col3 = __riscv_vsub_vv_i16m1(tmp3, tmp4, vl); + + /* Scale down by a factor of 8, narrowing to 8-bit. */ + /* Clamp to range [0-255]. */ + vint8m2_t tmp_i8m2; + vuint8m2_t u8m2_col0123, u8m2_col4567; + vint16m4_t i16m4_col0123 = __riscv_vlmul_ext_v_i16m1_i16m4(col0); + i16m4_col0123 = __riscv_vslideup_vx_i16m4(i16m4_col0123, __riscv_vlmul_ext_v_i16m1_i16m4(col1), 8, 16); + i16m4_col0123 = __riscv_vslideup_vx_i16m4(i16m4_col0123, __riscv_vlmul_ext_v_i16m1_i16m4(col2), 16, 24); + i16m4_col0123 = __riscv_vslideup_vx_i16m4(i16m4_col0123, __riscv_vlmul_ext_v_i16m1_i16m4(col3), 24, 32); + tmp_i8m2 = __riscv_vnclip_wx_i8m2(i16m4_col0123, PASS1_BITS + 3, 32); // clamp -128 ~ 127 + u8m2_col0123 = __riscv_vreinterpret_v_i8m2_u8m2(tmp_i8m2); + u8m2_col0123 = __riscv_vadd_vx_u8m2(u8m2_col0123, CENTERJSAMPLE, 32); // clamp 0 ~ 255 + + vint16m4_t i16m4_col4567 = __riscv_vlmul_ext_v_i16m1_i16m4(col4); + i16m4_col4567 = __riscv_vslideup_vx_i16m4(i16m4_col4567, __riscv_vlmul_ext_v_i16m1_i16m4(col5), 8, 16); + i16m4_col4567 = __riscv_vslideup_vx_i16m4(i16m4_col4567, __riscv_vlmul_ext_v_i16m1_i16m4(col6), 16, 24); + i16m4_col4567 = __riscv_vslideup_vx_i16m4(i16m4_col4567, __riscv_vlmul_ext_v_i16m1_i16m4(col7), 24, 32); + tmp_i8m2 = __riscv_vnclip_wx_i8m2(i16m4_col4567, PASS1_BITS + 3, 32); // clamp -128 ~ 127 + u8m2_col4567 = __riscv_vreinterpret_v_i8m2_u8m2(tmp_i8m2); + u8m2_col4567 = __riscv_vadd_vx_u8m2(u8m2_col4567, CENTERJSAMPLE, 32); // clamp 0 ~ 255 + + vuint8m4_t u8m4_col_all = __riscv_vlmul_ext_v_u8m2_u8m4(u8m2_col0123); + u8m4_col_all = __riscv_vslideup_vx_u8m4(u8m4_col_all, __riscv_vlmul_ext_v_u8m2_u8m4(u8m2_col4567), 32, 64); + + /* Transpose block to prepare for store. */ + vuint8m4_t u8m4_trans_all = __riscv_vrgather_vv_u8m4(u8m4_col_all, vg_reg8, DCTSIZE2); + + // extract columns + vuint8mf2_t u8mf2_col_0, u8mf2_col_1, u8mf2_col_2, u8mf2_col_3; + vuint8mf2_t u8mf2_col_4, u8mf2_col_5, u8mf2_col_6, u8mf2_col_7; + + vuint8m4_t slidedown_m4 = __riscv_vslidedown_vx_u8m4(u8m4_trans_all, 0, vl); + u8mf2_col_0 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(u8m4_trans_all, 8, vl); + u8mf2_col_1 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(u8m4_trans_all, 16, vl); + u8mf2_col_2 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(u8m4_trans_all, 24, vl); + u8mf2_col_3 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(u8m4_trans_all, 32, vl); + u8mf2_col_4 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(u8m4_trans_all, 40, vl); + u8mf2_col_5 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(u8m4_trans_all, 48, vl); + u8mf2_col_6 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(u8m4_trans_all, 56, vl); + u8mf2_col_7 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + JSAMPROW outptr0 = output_buf[0] + output_col; + JSAMPROW outptr1 = output_buf[1] + output_col; + JSAMPROW outptr2 = output_buf[2] + output_col; + JSAMPROW outptr3 = output_buf[3] + output_col; + JSAMPROW outptr4 = output_buf[4] + output_col; + JSAMPROW outptr5 = output_buf[5] + output_col; + JSAMPROW outptr6 = output_buf[6] + output_col; + JSAMPROW outptr7 = output_buf[7] + output_col; + + /* Store DCT block to memory. */ + __riscv_vse8_v_u8mf2(outptr0, u8mf2_col_0, vl); + __riscv_vse8_v_u8mf2(outptr1, u8mf2_col_1, vl); + __riscv_vse8_v_u8mf2(outptr2, u8mf2_col_2, vl); + __riscv_vse8_v_u8mf2(outptr3, u8mf2_col_3, vl); + __riscv_vse8_v_u8mf2(outptr4, u8mf2_col_4, vl); + __riscv_vse8_v_u8mf2(outptr5, u8mf2_col_5, vl); + __riscv_vse8_v_u8mf2(outptr6, u8mf2_col_6, vl); + __riscv_vse8_v_u8mf2(outptr7, u8mf2_col_7, vl); +} + diff --git a/simd/rvv_andes/jidctint-rvv.c b/simd/rvv_andes/jidctint-rvv.c new file mode 100644 index 000000000..737ff2797 --- /dev/null +++ b/simd/rvv_andes/jidctint-rvv.c @@ -0,0 +1,489 @@ +/* +* jidctint-rvv.c - accurate integer IDCT (RISC-V RVV) +* +* Copyright (c) 2012-2024 Andes Technology Corporation +* All rights reserved. +*/ +/* + * jidctint-neon.c - accurate integer IDCT (Arm Neon) + * + * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * Copyright (C) 2020, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" + +#include + + +#define CONST_BITS 13 +#define PASS1_BITS 2 + +#define DESCALE_P1 (CONST_BITS - PASS1_BITS) +#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) + +/* The computation of the inverse DCT requires the use of constants known at + * compile time. Scaled integer constants are used to avoid floating-point + * arithmetic: + * 0.298631336 = 2446 * 2^-13 + * 0.390180644 = 3196 * 2^-13 + * 0.541196100 = 4433 * 2^-13 + * 0.765366865 = 6270 * 2^-13 + * 0.899976223 = 7373 * 2^-13 + * 1.175875602 = 9633 * 2^-13 + * 1.501321110 = 12299 * 2^-13 + * 1.847759065 = 15137 * 2^-13 + * 1.961570560 = 16069 * 2^-13 + * 2.053119869 = 16819 * 2^-13 + * 2.562915447 = 20995 * 2^-13 + * 3.072711026 = 25172 * 2^-13 + */ + +#define F_0_298 2446 +#define F_0_390 3196 +#define F_0_541 4433 +#define F_0_765 6270 +#define F_0_899 7373 +#define F_1_175 9633 +#define F_1_501 12299 +#define F_1_847 15137 +#define F_1_961 16069 +#define F_2_053 16819 +#define F_2_562 20995 +#define F_3_072 25172 + +#define F_1_175_MINUS_1_961 (F_1_175 - F_1_961) +#define F_1_175_MINUS_0_390 (F_1_175 - F_0_390) +#define F_0_541_MINUS_1_847 (F_0_541 - F_1_847) +#define F_3_072_MINUS_2_562 (F_3_072 - F_2_562) +#define F_0_298_MINUS_0_899 (F_0_298 - F_0_899) +#define F_1_501_MINUS_0_899 (F_1_501 - F_0_899) +#define F_2_053_MINUS_2_562 (F_2_053 - F_2_562) +#define F_0_541_PLUS_0_765 (F_0_541 + F_0_765) + + +static const int16_t idct_islow_consts[] = { + F_0_899, F_0_541, + F_2_562, F_0_298_MINUS_0_899, + F_1_501_MINUS_0_899, F_2_053_MINUS_2_562, + F_0_541_PLUS_0_765, F_1_175, + F_1_175_MINUS_0_390, F_0_541_MINUS_1_847, + F_3_072_MINUS_2_562, F_1_175_MINUS_1_961, + 0, 0, 0, 0 +}; + +#define TRANS_TABLE_U8_SIZE 64 +const uint8_t trans_index8x8_u8[TRANS_TABLE_U8_SIZE] = +{ +/* #0 #1 #2 #3 #7 #6 #5 #4 */ + 0, 8, 16, 24, 56, 48, 40, 32, + 1, 9, 17, 25, 57, 49, 41, 33, + 2, 10, 18, 26, 58, 50, 42, 34, + 3, 11, 19, 27, 59, 51, 43, 35, + 4, 12, 20, 28, 60, 52, 44, 36, + 5, 13, 21, 29, 61, 53, 45, 37, + 6, 14, 22, 30, 62, 54, 46, 38, + 7, 15, 23, 31, 63, 55, 47, 39, +}; + + +/* Forward declaration of regular and sparse IDCT helper functions */ + +static INLINE void jsimd_idct_islow_pass1_regular(vint16m1_t row0, + vint16m1_t row1, + vint16m1_t row2, + vint16m1_t row3, + vint16m1_t row4, + vint16m1_t row5, + vint16m1_t row6, + vint16m1_t row7, + vint16m1_t quant_row0, + vint16m1_t quant_row1, + vint16m1_t quant_row2, + vint16m1_t quant_row3, + vint16m1_t quant_row4, + vint16m1_t quant_row5, + vint16m1_t quant_row6, + vint16m1_t quant_row7, + vint16m8_t *cols_all_i16m8); + +static INLINE void jsimd_idct_islow_pass2_regular(vint16m8_t *cols_all_i16m8, + JSAMPARRAY output_buf, + JDIMENSION output_col); + + +/* Perform dequantization and inverse DCT on one block of coefficients. For + * reference, the C implementation (jpeg_idct_slow()) can be found in + * jidctint.c. + */ + +void jsimd_idct_islow_rvv(void *dct_table, JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + ISLOW_MULT_TYPE *quantptr = dct_table; + vint16m8_t cols_all_i16m8; + + /* Compute IDCT first pass on left 4x8 coefficient block. */ + + /* Load DCT coefficients. */ + size_t vl = 8; + vint16m1_t row0 = __riscv_vle16_v_i16m1(coef_block + 0 * DCTSIZE, vl); + vint16m1_t row1 = __riscv_vle16_v_i16m1(coef_block + 1 * DCTSIZE, vl); + vint16m1_t row2 = __riscv_vle16_v_i16m1(coef_block + 2 * DCTSIZE, vl); + vint16m1_t row3 = __riscv_vle16_v_i16m1(coef_block + 3 * DCTSIZE, vl); + vint16m1_t row4 = __riscv_vle16_v_i16m1(coef_block + 4 * DCTSIZE, vl); + vint16m1_t row5 = __riscv_vle16_v_i16m1(coef_block + 5 * DCTSIZE, vl); + vint16m1_t row6 = __riscv_vle16_v_i16m1(coef_block + 6 * DCTSIZE, vl); + vint16m1_t row7 = __riscv_vle16_v_i16m1(coef_block + 7 * DCTSIZE, vl); + + /* Load quantization table. */ + vint16m1_t quant_row0 = __riscv_vle16_v_i16m1(quantptr + 0 * DCTSIZE, vl); + vint16m1_t quant_row1 = __riscv_vle16_v_i16m1(quantptr + 1 * DCTSIZE, vl); + vint16m1_t quant_row2 = __riscv_vle16_v_i16m1(quantptr + 2 * DCTSIZE, vl); + vint16m1_t quant_row3 = __riscv_vle16_v_i16m1(quantptr + 3 * DCTSIZE, vl); + vint16m1_t quant_row4 = __riscv_vle16_v_i16m1(quantptr + 4 * DCTSIZE, vl); + vint16m1_t quant_row5 = __riscv_vle16_v_i16m1(quantptr + 5 * DCTSIZE, vl); + vint16m1_t quant_row6 = __riscv_vle16_v_i16m1(quantptr + 6 * DCTSIZE, vl); + vint16m1_t quant_row7 = __riscv_vle16_v_i16m1(quantptr + 7 * DCTSIZE, vl); + + /* Construct bitmap to test if DCT coefficients are 0. */ + vuint16m1_t vec_zero = __riscv_vmv_s_x_u16m1(0, vl); + vint16m1_t bitmap = __riscv_vor_vv_i16m1(row7, row6, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row5, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row4, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row3, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row2, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row1, vl); + vuint16m1_t tmp_u16m1 = __riscv_vredor_vs_u16m1_u16m1(__riscv_vreinterpret_v_i16m1_u16m1(bitmap), vec_zero, vl); + uint16_t ac_bitmap = __riscv_vmv_x_s_u16m1_u16(tmp_u16m1); + if (0 == ac_bitmap) { + vint16m1_t dcval_i16m1 = __riscv_vmul_vv_i16m1(row0, quant_row0, vl); + dcval_i16m1 = __riscv_vsll_vx_i16m1(dcval_i16m1, PASS1_BITS, vl); + + // combine vectors + vint16m4_t tmp_i16m4 = __riscv_vlmul_ext_v_i16m1_i16m4(dcval_i16m1); + tmp_i16m4 = __riscv_vslideup_vx_i16m4(tmp_i16m4, tmp_i16m4, DCTSIZE2/8, DCTSIZE2/4); + tmp_i16m4 = __riscv_vslideup_vx_i16m4(tmp_i16m4, tmp_i16m4, DCTSIZE2/4, DCTSIZE2/2); + cols_all_i16m8 = __riscv_vlmul_ext_v_i16m4_i16m8(tmp_i16m4); + cols_all_i16m8 = __riscv_vslideup_vx_i16m8(cols_all_i16m8, cols_all_i16m8, DCTSIZE2/2, DCTSIZE2); + } else { + jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5, + row6, row7, quant_row0, quant_row1, + quant_row2, quant_row3, quant_row4, + quant_row5, quant_row6, quant_row7, + &cols_all_i16m8); + } + + /* Second pass: compute IDCT on rows in workspace. */ + jsimd_idct_islow_pass2_regular(&cols_all_i16m8, output_buf, output_col); +} + + +/* +* This "regular" version assumes that no optimization can be made to the IDCT +* calculation, since no useful set of AC coefficients is all 0. +* +* The original C implementation of the accurate IDCT (jpeg_idct_slow()) can be +* found in jidctint.c. Algorithmic changes made here are documented inline. +*/ + +static INLINE void jsimd_idct_islow_pass1_regular(vint16m1_t row0, + vint16m1_t row1, + vint16m1_t row2, + vint16m1_t row3, + vint16m1_t row4, + vint16m1_t row5, + vint16m1_t row6, + vint16m1_t row7, + vint16m1_t quant_row0, + vint16m1_t quant_row1, + vint16m1_t quant_row2, + vint16m1_t quant_row3, + vint16m1_t quant_row4, + vint16m1_t quant_row5, + vint16m1_t quant_row6, + vint16m1_t quant_row7, + vint16m8_t *cols_all_i16m8) +{ + /* Even part */ + size_t vl = 8; + vint16m1_t z2_s16 = __riscv_vmul_vv_i16m1(row2, quant_row2, vl); + vint16m1_t z3_s16 = __riscv_vmul_vv_i16m1(row6, quant_row6, vl); + vint32m2_t tmp2 = __riscv_vwmul_vx_i32m2(z2_s16, idct_islow_consts[1], vl); + vint32m2_t tmp3 = __riscv_vwmul_vx_i32m2(z2_s16, idct_islow_consts[6], vl); + tmp2 = __riscv_vwmacc_vx_i32m2(tmp2, idct_islow_consts[9], z3_s16, vl); + tmp3 = __riscv_vwmacc_vx_i32m2(tmp3, idct_islow_consts[1], z3_s16, vl); + + z2_s16 = __riscv_vmul_vv_i16m1(row0, quant_row0, vl); + z3_s16 = __riscv_vmul_vv_i16m1(row4, quant_row4, vl); + vint32m2_t tmp0 = __riscv_vwmul_vx_i32m2(__riscv_vadd_vv_i16m1(z2_s16, z3_s16, vl), 8192, vl); // 8192 = 2^(CONST_BITS) + vint32m2_t tmp1 = __riscv_vwmul_vx_i32m2(__riscv_vsub_vv_i16m1(z2_s16, z3_s16, vl), 8192, vl); + + vint32m2_t tmp10 = __riscv_vadd_vv_i32m2(tmp0, tmp3, vl); + vint32m2_t tmp13 = __riscv_vsub_vv_i32m2(tmp0, tmp3, vl); + vint32m2_t tmp11 = __riscv_vadd_vv_i32m2(tmp1, tmp2, vl); + vint32m2_t tmp12 = __riscv_vsub_vv_i32m2(tmp1, tmp2, vl); + + /* Odd part */ + vint16m1_t tmp0_s16 = __riscv_vmul_vv_i16m1(row7, quant_row7, vl); + vint16m1_t tmp1_s16 = __riscv_vmul_vv_i16m1(row5, quant_row5, vl); + vint16m1_t tmp2_s16 = __riscv_vmul_vv_i16m1(row3, quant_row3, vl); + vint16m1_t tmp3_s16 = __riscv_vmul_vv_i16m1(row1, quant_row1, vl); + z3_s16 = __riscv_vadd_vv_i16m1(tmp0_s16, tmp2_s16, vl); + vint16m1_t z4_s16 = __riscv_vadd_vv_i16m1(tmp1_s16, tmp3_s16, vl); + + vint32m2_t z3 = __riscv_vwmul_vx_i32m2(z3_s16, idct_islow_consts[11], vl); + vint32m2_t z4 = __riscv_vwmul_vx_i32m2(z3_s16, idct_islow_consts[ 7], vl); + z3 = __riscv_vwmacc_vx_i32m2(z3, idct_islow_consts[7], z4_s16, vl); + z4 = __riscv_vwmacc_vx_i32m2(z4, idct_islow_consts[8], z4_s16, vl); + + tmp0 = __riscv_vwmul_vx_i32m2(tmp0_s16, idct_islow_consts[ 3], vl); + tmp1 = __riscv_vwmul_vx_i32m2(tmp1_s16, idct_islow_consts[ 5], vl); + tmp2 = __riscv_vwmul_vx_i32m2(tmp2_s16, idct_islow_consts[10], vl); + tmp3 = __riscv_vwmul_vx_i32m2(tmp3_s16, idct_islow_consts[ 4], vl); + + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, -idct_islow_consts[0], tmp3_s16, vl); + tmp1 = __riscv_vwmacc_vx_i32m2(tmp1, -idct_islow_consts[2], tmp2_s16, vl); + tmp2 = __riscv_vwmacc_vx_i32m2(tmp2, -idct_islow_consts[2], tmp1_s16, vl); + tmp3 = __riscv_vwmacc_vx_i32m2(tmp3, -idct_islow_consts[0], tmp0_s16, vl); + + tmp0 = __riscv_vadd_vv_i32m2(tmp0, z3, vl); + tmp1 = __riscv_vadd_vv_i32m2(tmp1, z4, vl); + tmp2 = __riscv_vadd_vv_i32m2(tmp2, z3, vl); + tmp3 = __riscv_vadd_vv_i32m2(tmp3, z4, vl); + + /* Final output stage: descale and narrow to 16-bit. */ + vint16m4_t cols_0123_i16m4, cols_7654_i16m4; + vint8m2_t tmp_i8m2; + vint16m4_t tmp_i16m4; + vint32m8_t tmp_i32m8; + + // combine (tmp10, tmp11, tmp12, tmp13) + vint32m4_t tmps_10_11 = __riscv_vlmul_ext_v_i32m2_i32m4(tmp10); + tmps_10_11 = __riscv_vslideup_vx_i32m4(tmps_10_11, __riscv_vlmul_ext_v_i32m2_i32m4(tmp11), 8, 16); + vint32m4_t tmps_12_13 = __riscv_vlmul_ext_v_i32m2_i32m4(tmp12); + tmps_12_13 = __riscv_vslideup_vx_i32m4(tmps_12_13, __riscv_vlmul_ext_v_i32m2_i32m4(tmp13), 8, 16); + + vint32m8_t tmps_10_11_12_13 = __riscv_vlmul_ext_v_i32m4_i32m8(tmps_10_11); + tmps_10_11_12_13 = __riscv_vslideup_vx_i32m8(tmps_10_11_12_13, __riscv_vlmul_ext_v_i32m4_i32m8(tmps_12_13), 16, 32); + + // combine (tmp3, tmp2, tmp1, tmp0) + vint32m4_t tmps_3_2 = __riscv_vlmul_ext_v_i32m2_i32m4(tmp3); + tmps_3_2 = __riscv_vslideup_vx_i32m4(tmps_3_2, __riscv_vlmul_ext_v_i32m2_i32m4(tmp2), 8, 16); + vint32m4_t tmps_1_0 = __riscv_vlmul_ext_v_i32m2_i32m4(tmp1); + tmps_1_0 = __riscv_vslideup_vx_i32m4(tmps_1_0, __riscv_vlmul_ext_v_i32m2_i32m4(tmp0), 8, 16); + + vint32m8_t tmps_3_2_1_0 = __riscv_vlmul_ext_v_i32m4_i32m8(tmps_3_2); + tmps_3_2_1_0 = __riscv_vslideup_vx_i32m8(tmps_3_2_1_0, __riscv_vlmul_ext_v_i32m4_i32m8(tmps_1_0), 16, 32); + + // col 0, 1, 2, 3 + tmp_i32m8 = __riscv_vadd_vv_i32m8(tmps_10_11_12_13, tmps_3_2_1_0, 32); + tmp_i32m8 = __riscv_vadd_vx_i32m8(tmp_i32m8, 1<<(DESCALE_P1-1), 32); // for Rounding + cols_0123_i16m4 = __riscv_vnsra_wx_i16m4(tmp_i32m8, DESCALE_P1, 32); + + // col 7, 6, 5, 4 + tmp_i32m8 = __riscv_vsub_vv_i32m8(tmps_10_11_12_13, tmps_3_2_1_0, 32); + tmp_i32m8 = __riscv_vadd_vx_i32m8(tmp_i32m8, 1<<(DESCALE_P1-1), 32); // for Rounding + cols_7654_i16m4 = __riscv_vnsra_wx_i16m4(tmp_i32m8, DESCALE_P1, 32); + + // combine vectors + *cols_all_i16m8 = __riscv_vlmul_ext_v_i16m4_i16m8(cols_0123_i16m4); + *cols_all_i16m8 = __riscv_vslideup_vx_i16m8(*cols_all_i16m8, __riscv_vlmul_ext_v_i16m4_i16m8(cols_7654_i16m4), 32, 64); +} + + +/* Perform the second pass of the accurate inverse DCT on a 4x8 block of + * coefficients. (To process the full 8x8 DCT block, this function-- or some + * other optimized variant-- needs to be called for both the right and left 4x8 + * blocks.) + * + * This "regular" version assumes that no optimization can be made to the IDCT + * calculation, since no useful set of coefficient values are all 0 after the + * first pass. + * + * Again, the original C implementation of the accurate IDCT (jpeg_idct_slow()) + * can be found in jidctint.c. Algorithmic changes made here are documented + * inline. + */ + +static INLINE void jsimd_idct_islow_pass2_regular(vint16m8_t *cols_all_i16m8, + JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + int16_t workspace[DCTSIZE2]; /* buffers data */ + + // load transpose look-up table + vuint8m4_t vg_reg8 = __riscv_vle8_v_u8m4(trans_index8x8_u8, TRANS_TABLE_U8_SIZE); + + // transpose + vint16m8_t trans_all_i16m8 = __riscv_vrgather_vv_i16m8(*cols_all_i16m8, \ + __riscv_vzext_vf2_u16m8(vg_reg8, TRANS_TABLE_U8_SIZE), TRANS_TABLE_U8_SIZE); + + __riscv_vse16_v_i16m8(&workspace[0], trans_all_i16m8, DCTSIZE2); + + /* Even part */ + size_t vl = 8; + vint16m1_t z2_s16 = __riscv_vle16_v_i16m1(workspace + 2 * DCTSIZE, vl); + vint16m1_t z3_s16 = __riscv_vle16_v_i16m1(workspace + 6 * DCTSIZE, vl); + + vint32m2_t tmp2 = __riscv_vwmul_vx_i32m2(z2_s16, idct_islow_consts[1], vl); + vint32m2_t tmp3 = __riscv_vwmul_vx_i32m2(z2_s16, idct_islow_consts[6], vl); + tmp2 = __riscv_vwmacc_vx_i32m2(tmp2, idct_islow_consts[9], z3_s16, vl); + tmp3 = __riscv_vwmacc_vx_i32m2(tmp3, idct_islow_consts[1], z3_s16, vl); + + z2_s16 = __riscv_vle16_v_i16m1(workspace + 0 * DCTSIZE, vl); + z3_s16 = __riscv_vle16_v_i16m1(workspace + 4 * DCTSIZE, vl); + + vint32m2_t tmp0 = __riscv_vwmul_vx_i32m2(__riscv_vadd_vv_i16m1(z2_s16, z3_s16, vl), 8192, vl); + vint32m2_t tmp1 = __riscv_vwmul_vx_i32m2(__riscv_vsub_vv_i16m1(z2_s16, z3_s16, vl), 8192, vl); + + vint32m2_t tmp10 = __riscv_vadd_vv_i32m2(tmp0, tmp3, vl); + vint32m2_t tmp13 = __riscv_vsub_vv_i32m2(tmp0, tmp3, vl); + vint32m2_t tmp11 = __riscv_vadd_vv_i32m2(tmp1, tmp2, vl); + vint32m2_t tmp12 = __riscv_vsub_vv_i32m2(tmp1, tmp2, vl); + + /* Odd part */ + vint16m1_t tmp0_s16 = __riscv_vle16_v_i16m1(workspace + 7 * DCTSIZE, vl); + vint16m1_t tmp1_s16 = __riscv_vle16_v_i16m1(workspace + 5 * DCTSIZE, vl); + vint16m1_t tmp2_s16 = __riscv_vle16_v_i16m1(workspace + 3 * DCTSIZE, vl); + vint16m1_t tmp3_s16 = __riscv_vle16_v_i16m1(workspace + 1 * DCTSIZE, vl); + + z3_s16 = __riscv_vadd_vv_i16m1(tmp0_s16, tmp2_s16, vl); + vint16m1_t z4_s16 = __riscv_vadd_vv_i16m1(tmp1_s16, tmp3_s16, vl); + + vint32m2_t z3 = __riscv_vwmul_vx_i32m2(z3_s16, idct_islow_consts[11], vl); + vint32m2_t z4 = __riscv_vwmul_vx_i32m2(z3_s16, idct_islow_consts[ 7], vl); + z3 = __riscv_vwmacc_vx_i32m2(z3, idct_islow_consts[7], z4_s16, vl); + z4 = __riscv_vwmacc_vx_i32m2(z4, idct_islow_consts[8], z4_s16, vl); + + tmp0 = __riscv_vwmul_vx_i32m2(tmp0_s16, idct_islow_consts[ 3], vl); + tmp1 = __riscv_vwmul_vx_i32m2(tmp1_s16, idct_islow_consts[ 5], vl); + tmp2 = __riscv_vwmul_vx_i32m2(tmp2_s16, idct_islow_consts[10], vl); + tmp3 = __riscv_vwmul_vx_i32m2(tmp3_s16, idct_islow_consts[ 4], vl); + + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, -idct_islow_consts[0], tmp3_s16, vl); + tmp1 = __riscv_vwmacc_vx_i32m2(tmp1, -idct_islow_consts[2], tmp2_s16, vl); + tmp2 = __riscv_vwmacc_vx_i32m2(tmp2, -idct_islow_consts[2], tmp1_s16, vl); + tmp3 = __riscv_vwmacc_vx_i32m2(tmp3, -idct_islow_consts[0], tmp0_s16, vl); + + tmp0 = __riscv_vadd_vv_i32m2(tmp0, z3, vl); + tmp1 = __riscv_vadd_vv_i32m2(tmp1, z4, vl); + tmp2 = __riscv_vadd_vv_i32m2(tmp2, z3, vl); + tmp3 = __riscv_vadd_vv_i32m2(tmp3, z4, vl); + +/* Final output stage: descale and narrow to 8-bit. */ +/* Clamp to range [0-255]. */ + vuint8m2_t cols_0123_u8m2, cols_7654_u8m2; + vint8m2_t tmp_i8m2; + vint16m4_t tmp_i16m4; + vint32m8_t tmp_i32m8; + + // combine (tmp10, tmp11, tmp12, tmp13) + vint32m4_t tmps_10_11 = __riscv_vlmul_ext_v_i32m2_i32m4(tmp10); + tmps_10_11 = __riscv_vslideup_vx_i32m4(tmps_10_11, __riscv_vlmul_ext_v_i32m2_i32m4(tmp11), 8, 16); + vint32m4_t tmps_12_13 = __riscv_vlmul_ext_v_i32m2_i32m4(tmp12); + tmps_12_13 = __riscv_vslideup_vx_i32m4(tmps_12_13, __riscv_vlmul_ext_v_i32m2_i32m4(tmp13), 8, 16); + + vint32m8_t tmps_10_11_12_13 = __riscv_vlmul_ext_v_i32m4_i32m8(tmps_10_11); + tmps_10_11_12_13 = __riscv_vslideup_vx_i32m8(tmps_10_11_12_13, __riscv_vlmul_ext_v_i32m4_i32m8(tmps_12_13), 16, 32); + + // combine (tmp3, tmp2, tmp1, tmp0) + vint32m4_t tmps_3_2 = __riscv_vlmul_ext_v_i32m2_i32m4(tmp3); + tmps_3_2 = __riscv_vslideup_vx_i32m4(tmps_3_2, __riscv_vlmul_ext_v_i32m2_i32m4(tmp2), 8, 16); + vint32m4_t tmps_1_0 = __riscv_vlmul_ext_v_i32m2_i32m4(tmp1); + tmps_1_0 = __riscv_vslideup_vx_i32m4(tmps_1_0, __riscv_vlmul_ext_v_i32m2_i32m4(tmp0), 8, 16); + + vint32m8_t tmps_3_2_1_0 = __riscv_vlmul_ext_v_i32m4_i32m8(tmps_3_2); + tmps_3_2_1_0 = __riscv_vslideup_vx_i32m8(tmps_3_2_1_0, __riscv_vlmul_ext_v_i32m4_i32m8(tmps_1_0), 16, 32); + + // col 0, 1, 2, 3 + tmp_i32m8 = __riscv_vadd_vv_i32m8(tmps_10_11_12_13, tmps_3_2_1_0, 32); + tmp_i16m4 = __riscv_vnsra_wx_i16m4(tmp_i32m8, 16, 32); + tmp_i8m2 = __riscv_vnclip_wx_i8m2(tmp_i16m4, DESCALE_P2 - 16, 32); + cols_0123_u8m2 = __riscv_vreinterpret_v_i8m2_u8m2(tmp_i8m2); + cols_0123_u8m2 = __riscv_vadd_vx_u8m2(cols_0123_u8m2, CENTERJSAMPLE, 32); + + // col 7, 6, 5, 4 + tmp_i32m8 = __riscv_vsub_vv_i32m8(tmps_10_11_12_13, tmps_3_2_1_0, 32); + tmp_i16m4 = __riscv_vnsra_wx_i16m4(tmp_i32m8, 16, 32); + tmp_i8m2 = __riscv_vnclip_wx_i8m2(tmp_i16m4, DESCALE_P2 - 16, 32); + cols_7654_u8m2 = __riscv_vreinterpret_v_i8m2_u8m2(tmp_i8m2); + cols_7654_u8m2 = __riscv_vadd_vx_u8m2(cols_7654_u8m2, CENTERJSAMPLE, 32); + + // combine vectors + vuint8m4_t cols_all_u8m4 = __riscv_vlmul_ext_v_u8m2_u8m4(cols_0123_u8m2); + cols_all_u8m4 = __riscv_vslideup_vx_u8m4(cols_all_u8m4, __riscv_vlmul_ext_v_u8m2_u8m4(cols_7654_u8m2), 32, 64); + + // transpose + vuint8m4_t trans_all_u8m4 = __riscv_vrgather(cols_all_u8m4, vg_reg8, TRANS_TABLE_U8_SIZE); + + // extract columns + vuint8mf2_t col_0_u8mf2, col_1_u8mf2, col_2_u8mf2, col_3_u8mf2; + vuint8mf2_t col_4_u8mf2, col_5_u8mf2, col_6_u8mf2, col_7_u8mf2; + + vuint8m4_t slidedown_m4 = __riscv_vslidedown_vx_u8m4(trans_all_u8m4, 0, vl); + col_0_u8mf2 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(trans_all_u8m4, 8, vl); + col_1_u8mf2 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(trans_all_u8m4, 16, vl); + col_2_u8mf2 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(trans_all_u8m4, 24, vl); + col_3_u8mf2 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(trans_all_u8m4, 32, vl); + col_4_u8mf2 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(trans_all_u8m4, 40, vl); + col_5_u8mf2 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(trans_all_u8m4, 48, vl); + col_6_u8mf2 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(trans_all_u8m4, 56, vl); + col_7_u8mf2 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + // store to memory + JSAMPROW outptr0 = output_buf[0] + output_col; + JSAMPROW outptr1 = output_buf[1] + output_col; + JSAMPROW outptr2 = output_buf[2] + output_col; + JSAMPROW outptr3 = output_buf[3] + output_col; + JSAMPROW outptr4 = output_buf[4] + output_col; + JSAMPROW outptr5 = output_buf[5] + output_col; + JSAMPROW outptr6 = output_buf[6] + output_col; + JSAMPROW outptr7 = output_buf[7] + output_col; + + __riscv_vse8_v_u8mf2(outptr0, col_0_u8mf2, vl); + __riscv_vse8_v_u8mf2(outptr1, col_1_u8mf2, vl); + __riscv_vse8_v_u8mf2(outptr2, col_2_u8mf2, vl); + __riscv_vse8_v_u8mf2(outptr3, col_3_u8mf2, vl); + __riscv_vse8_v_u8mf2(outptr4, col_4_u8mf2, vl); + __riscv_vse8_v_u8mf2(outptr5, col_5_u8mf2, vl); + __riscv_vse8_v_u8mf2(outptr6, col_6_u8mf2, vl); + __riscv_vse8_v_u8mf2(outptr7, col_7_u8mf2, vl); +} + diff --git a/simd/rvv_andes/jidctred-rvv.c b/simd/rvv_andes/jidctred-rvv.c new file mode 100644 index 000000000..4e03d6da4 --- /dev/null +++ b/simd/rvv_andes/jidctred-rvv.c @@ -0,0 +1,436 @@ +/* +* jidctred-rvv.c - reduced-size IDCT (RISC-V RVV) +* +* Copyright (c) 2012-2024 Andes Technology Corporation +* All rights reserved. +*/ +/* + * jidctred-neon.c - reduced-size IDCT (Arm Neon) + * + * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * Copyright (C) 2020, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../../jinclude.h" +#include "../../jpeglib.h" +#include "../../jsimd.h" +#include "../../jdct.h" +#include "../../jsimddct.h" +#include "../jsimd.h" + +#include + + +#define CONST_BITS 13 +#define PASS1_BITS 2 + +#define F_0_211 1730 +#define F_0_509 4176 +#define F_0_601 4926 +#define F_0_720 5906 +#define F_0_765 6270 +#define F_0_850 6967 +#define F_0_899 7373 +#define F_1_061 8697 +#define F_1_272 10426 +#define F_1_451 11893 +#define F_1_847 15137 +#define F_2_172 17799 +#define F_2_562 20995 +#define F_3_624 29692 + + +/* jsimd_idct_2x2_rvv() is an inverse DCT function that produces reduced-size + * 2x2 output from an 8x8 DCT block. It uses the same calculations and + * produces exactly the same output as IJG's original jpeg_idct_2x2() function + * from jpeg-6b, which can be found in jidctred.c. + * + * Scaled integer constants are used to avoid floating-point arithmetic: + * 0.720959822 = 5906 * 2^-13 + * 0.850430095 = 6967 * 2^-13 + * 1.272758580 = 10426 * 2^-13 + * 3.624509785 = 29692 * 2^-13 + * + * See jidctred.c for further details of the 2x2 IDCT algorithm. Where + * possible, the variable names and comments here in jsimd_idct_2x2_rvv() + * match up with those in jpeg_idct_2x2(). + */ + +static const int16_t jsimd_idct_2x2_consts[] = { + -F_0_720, F_0_850, -F_1_272, F_3_624 +}; + +void jsimd_idct_2x2_rvv(void *dct_table, JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + ISLOW_MULT_TYPE *quantptr = dct_table; + + /* Load DCT coefficients. */ + size_t vl = 8; + vint16m1_t row0 = __riscv_vle16_v_i16m1(coef_block + 0 * DCTSIZE, vl); + vint16m1_t row1 = __riscv_vle16_v_i16m1(coef_block + 1 * DCTSIZE, vl); + vint16m1_t row3 = __riscv_vle16_v_i16m1(coef_block + 3 * DCTSIZE, vl); + vint16m1_t row5 = __riscv_vle16_v_i16m1(coef_block + 5 * DCTSIZE, vl); + vint16m1_t row7 = __riscv_vle16_v_i16m1(coef_block + 7 * DCTSIZE, vl); + + /* Load quantization table values. */ + vint16m1_t quant_row0 = __riscv_vle16_v_i16m1(quantptr + 0 * DCTSIZE, vl); + vint16m1_t quant_row1 = __riscv_vle16_v_i16m1(quantptr + 1 * DCTSIZE, vl); + vint16m1_t quant_row3 = __riscv_vle16_v_i16m1(quantptr + 3 * DCTSIZE, vl); + vint16m1_t quant_row5 = __riscv_vle16_v_i16m1(quantptr + 5 * DCTSIZE, vl); + vint16m1_t quant_row7 = __riscv_vle16_v_i16m1(quantptr + 7 * DCTSIZE, vl); + + /* Dequantize DCT coefficients. */ + row0 = __riscv_vmul_vv_i16m1(row0, quant_row0, vl); + row1 = __riscv_vmul_vv_i16m1(row1, quant_row1, vl); + row3 = __riscv_vmul_vv_i16m1(row3, quant_row3, vl); + row5 = __riscv_vmul_vv_i16m1(row5, quant_row5, vl); + row7 = __riscv_vmul_vv_i16m1(row7, quant_row7, vl); + + /* Pass 1: process columns from input, put results in vectors row0 and + * row1. + */ + + /* Even part */ + vint32m2_t tmp10 = __riscv_vwmul_vx_i32m2(row0, 8192, vl); // 2 ^ (CONST_BITS) + tmp10 = __riscv_vsll_vx_i32m2 (tmp10, 2, vl); + + /* Odd part */ + vint32m2_t tmp0 = __riscv_vwmul_vx_i32m2(row1, jsimd_idct_2x2_consts[3], vl); + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, jsimd_idct_2x2_consts[2], row3, vl); + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, jsimd_idct_2x2_consts[1], row5, vl); + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, jsimd_idct_2x2_consts[0], row7, vl); + + /* Final output stage: descale and narrow to 16-bit. */ + vint32m2_t tmp_tt; + tmp_tt = __riscv_vadd_vv_i32m2(tmp10, tmp0, vl); + tmp_tt = __riscv_vadd_vx_i32m2(tmp_tt, 1<<(CONST_BITS-1), vl); + row0 = __riscv_vnsra_wx_i16m1(tmp_tt, CONST_BITS, vl); + + tmp_tt = __riscv_vsub_vv_i32m2(tmp10, tmp0, vl); + tmp_tt = __riscv_vadd_vx_i32m2(tmp_tt, 1<<(CONST_BITS-1), vl); + row1 = __riscv_vnsra_wx_i16m1(tmp_tt, CONST_BITS, vl); + + /* Transpose two rows, ready for second pass. */ + const uint8_t trans_index_tab_u8[16] = + { + 0, 8, + 1, 9, + 3, 11, + 5, 13, + 7, 15, + 0, 2, /* [10~13] output transpose index order */ + 1, 3, + 0, 0, + }; + + // extend m1 to m2 + vint16m2_t m2_row0 = __riscv_vlmul_ext_v_i16m1_i16m2(row0); + + // combine + m2_row0 = __riscv_vslideup_vx_i16m2(m2_row0, __riscv_vlmul_ext_v_i16m1_i16m2(row1), 8, 16); + + // load transpose look-up table + vuint8m1_t vg_reg8 = __riscv_vle8_v_u8m1(trans_index_tab_u8, 16); + + // saved for output index + vuint8m1_t index_order_u8m1 = __riscv_vslidedown_vx_u8m1(vg_reg8, 10, 4); + + // interpret to u16 & transpose + vint16m2_t vg_reg16 = __riscv_vrgather(m2_row0, __riscv_vzext_vf2_u16m2(vg_reg8, 10), 10); + + // extract columns + vl = 2; + vint16m1_t col0 = __riscv_vget_v_i16m2_i16m1(vg_reg16, 0); + vint16m1_t col1 = __riscv_vslidedown_vx_i16m1(col0, 2, vl); + vint16m1_t col3 = __riscv_vslidedown_vx_i16m1(col0, 4, vl); + vint16m1_t col5 = __riscv_vslidedown_vx_i16m1(col0, 6, vl); + + vint16m2_t slidedown_m2 = __riscv_vslidedown_vx_i16m2(vg_reg16, 8, vl); + vint16m1_t col7 = __riscv_vlmul_trunc_v_i16m2_i16m1(slidedown_m2); + + /* Pass 2: process two rows, store to output array. */ + + /* Even part: we're only interested in col0; the top half of tmp10 is "don't + * care." + */ + tmp10 = __riscv_vwmul_vx_i32m2(col0, 8192, vl); + tmp10 = __riscv_vsll_vx_i32m2 (tmp10, 2, vl); + + /* Odd part: we're only interested in the bottom half of tmp0. */ + tmp0 = __riscv_vwmul_vx_i32m2(col1, jsimd_idct_2x2_consts[3], vl); + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, jsimd_idct_2x2_consts[2], col3, vl); + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, jsimd_idct_2x2_consts[1], col5, vl); + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, jsimd_idct_2x2_consts[0], col7, vl); + + /* Final output stage: descale and clamp to range [0-255]. */ + /* Narrow to 8-bit and convert to unsigned. */ + vint32m2_t tmp_i32m2_0 = __riscv_vadd_vv_i32m2(tmp10, tmp0, 2); + vint32m2_t tmp_i32m2_1 = __riscv_vsub_vv_i32m2(tmp10, tmp0, 2); + tmp_i32m2_0 = __riscv_vslideup_vx_i32m2 (tmp_i32m2_0, tmp_i32m2_1, 2, 4); + vint16m1_t tmp_i16m1 = __riscv_vnsra_wx_i16m1(tmp_i32m2_0, 16, 4); + vint8mf2_t tmp_i8mf2 = __riscv_vnclip_wx_i8mf2(tmp_i16m1, CONST_BITS + PASS1_BITS + 3 + 2 - 16, 4); + vuint8mf2_t u8mf2_col0 = __riscv_vreinterpret_v_i8mf2_u8mf2(tmp_i8mf2); + u8mf2_col0 = __riscv_vadd_vx_u8mf2(u8mf2_col0, CENTERJSAMPLE, 4); + + /* Transpose */ + vuint8mf2_t v0_out = __riscv_vrgather_vv_u8mf2(u8mf2_col0, \ + __riscv_vlmul_trunc_v_u8m1_u8mf2(index_order_u8m1), 4); + vuint8mf2_t v1_out = __riscv_vslidedown_vx_u8mf2(v0_out, 2, 2); + + /* Store 2x2 block to memory. */ + JSAMPROW outptr0 = output_buf[0] + output_col; + JSAMPROW outptr1 = output_buf[1] + output_col; + __riscv_vse8_v_u8mf2(outptr0, v0_out, 2); + __riscv_vse8_v_u8mf2(outptr1, v1_out, 2); +} + + +/* jsimd_idct_4x4_rvv() is an inverse DCT function that produces reduced-size + * 4x4 output from an 8x8 DCT block. It uses the same calculations and + * produces exactly the same output as IJG's original jpeg_idct_4x4() function + * from jpeg-6b, which can be found in jidctred.c. + * + * Scaled integer constants are used to avoid floating-point arithmetic: + * 0.211164243 = 1730 * 2^-13 + * 0.509795579 = 4176 * 2^-13 + * 0.601344887 = 4926 * 2^-13 + * 0.765366865 = 6270 * 2^-13 + * 0.899976223 = 7373 * 2^-13 + * 1.061594337 = 8697 * 2^-13 + * 1.451774981 = 11893 * 2^-13 + * 1.847759065 = 15137 * 2^-13 + * 2.172734803 = 17799 * 2^-13 + * 2.562915447 = 20995 * 2^-13 + * + * See jidctred.c for further details of the 4x4 IDCT algorithm. Where + * possible, the variable names and comments here in jsimd_idct_4x4_rvv() + * match up with those in jpeg_idct_4x4(). + */ + +static const int16_t jsimd_idct_4x4_consts[] = { + F_1_847, -F_0_765, -F_0_211, F_1_451, + -F_2_172, F_1_061, -F_0_509, -F_0_601, + F_0_899, F_2_562, 0, 0 +}; + +void jsimd_idct_4x4_rvv(void *dct_table, JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + ptrdiff_t bstride; + ISLOW_MULT_TYPE *quantptr = dct_table; + vint16m4_t rows_0132_i16m4; + + /* Load DCT coefficients. */ + size_t vl = 8; + vint16m1_t row0 = __riscv_vle16_v_i16m1(coef_block + 0 * DCTSIZE, vl); + vint16m1_t row1 = __riscv_vle16_v_i16m1(coef_block + 1 * DCTSIZE, vl); + vint16m1_t row2 = __riscv_vle16_v_i16m1(coef_block + 2 * DCTSIZE, vl); + vint16m1_t row3 = __riscv_vle16_v_i16m1(coef_block + 3 * DCTSIZE, vl); + vint16m1_t row5 = __riscv_vle16_v_i16m1(coef_block + 5 * DCTSIZE, vl); + vint16m1_t row6 = __riscv_vle16_v_i16m1(coef_block + 6 * DCTSIZE, vl); + vint16m1_t row7 = __riscv_vle16_v_i16m1(coef_block + 7 * DCTSIZE, vl); + + /* Load quantization table values for DC coefficients. */ + /* Dequantize DC coefficients. */ + vint16m1_t quant_row0 = __riscv_vle16_v_i16m1(quantptr + 0 * DCTSIZE, vl); + row0 = __riscv_vmul_vv_i16m1(row0, quant_row0, vl); + + #define TRANS_TABLE_U8_SIZE 32 + const uint8_t trans_index_4x4_u8[TRANS_TABLE_U8_SIZE] = + { + 0, 8, 24, 16, + 1, 9, 25, 17, + 2, 10, 26, 18, + 3, 11, 27, 19, + 5, 13, 29, 21, + 6, 14, 30, 22, + 7, 15, 31, 23, + 0, 0, 0, 0, + }; + + /* load transpose look-up table */ + vuint8m2_t vg_reg8 = __riscv_vle8_v_u8m2(trans_index_4x4_u8, TRANS_TABLE_U8_SIZE); + + /* Construct bitmap to test if all AC coefficients are 0. */ + vuint16m1_t vec_zero = __riscv_vmv_s_x_u16m1(0, vl); + vint16m1_t bitmap = __riscv_vor_vv_i16m1(row7, row6, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row5, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row3, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row2, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row1, vl); + vuint16m1_t tmp_u16m1 = __riscv_vredor_vs_u16m1_u16m1(__riscv_vreinterpret_v_i16m1_u16m1(bitmap), vec_zero, vl); + uint16_t ac_bitmap = __riscv_vmv_x_s_u16m1_u16(tmp_u16m1); + if (0 == ac_bitmap) { + /* All AC coefficients are zero. */ + vint16m1_t dcval_i16m1 = __riscv_vsll_vx_i16m1(row0, PASS1_BITS, vl); + + /* combine vectors */ + rows_0132_i16m4 = __riscv_vlmul_ext_v_i16m1_i16m4(dcval_i16m1); + rows_0132_i16m4 = __riscv_vslideup_vx_i16m4(rows_0132_i16m4, rows_0132_i16m4, DCTSIZE2/8, DCTSIZE2/4); + rows_0132_i16m4 = __riscv_vslideup_vx_i16m4(rows_0132_i16m4, rows_0132_i16m4, DCTSIZE2/4, DCTSIZE2/2); + } else { + /* Load quantization table. */ + vint16m1_t quant_row1 = __riscv_vle16_v_i16m1(quantptr + 1 * DCTSIZE, vl); + vint16m1_t quant_row2 = __riscv_vle16_v_i16m1(quantptr + 2 * DCTSIZE, vl); + vint16m1_t quant_row3 = __riscv_vle16_v_i16m1(quantptr + 3 * DCTSIZE, vl); + vint16m1_t quant_row5 = __riscv_vle16_v_i16m1(quantptr + 5 * DCTSIZE, vl); + vint16m1_t quant_row6 = __riscv_vle16_v_i16m1(quantptr + 6 * DCTSIZE, vl); + vint16m1_t quant_row7 = __riscv_vle16_v_i16m1(quantptr + 7 * DCTSIZE, vl); + + /* Even part */ + vint32m2_t tmp0, tmp2; + tmp0 = __riscv_vwmul_vx_i32m2(row0, 16384, vl); // 16384 = 2^(CONST_BITS + 1) + row2 = __riscv_vmul_vv_i16m1(row2, quant_row2, vl); + row6 = __riscv_vmul_vv_i16m1(row6, quant_row6, vl); + tmp2 = __riscv_vwmul_vx_i32m2(row2, jsimd_idct_4x4_consts[0], vl); + tmp2 = __riscv_vwmacc_vx_i32m2(tmp2, jsimd_idct_4x4_consts[1], row6, vl); + vint32m2_t tmp10 = __riscv_vadd_vv_i32m2(tmp0, tmp2, vl); + vint32m2_t tmp12 = __riscv_vsub_vv_i32m2(tmp0, tmp2, vl); + + /* Odd part */ + row7 = __riscv_vmul_vv_i16m1(row7, quant_row7, vl); + row5 = __riscv_vmul_vv_i16m1(row5, quant_row5, vl); + row3 = __riscv_vmul_vv_i16m1(row3, quant_row3, vl); + row1 = __riscv_vmul_vv_i16m1(row1, quant_row1, vl); + + tmp0 = __riscv_vwmul_vx_i32m2 (row7, jsimd_idct_4x4_consts[2], vl); + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, jsimd_idct_4x4_consts[3], row5, vl); + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, jsimd_idct_4x4_consts[4], row3, vl); + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, jsimd_idct_4x4_consts[5], row1, vl); + + tmp2 = __riscv_vwmul_vx_i32m2 (row7, jsimd_idct_4x4_consts[6], vl); + tmp2 = __riscv_vwmacc_vx_i32m2(tmp2, jsimd_idct_4x4_consts[7], row5, vl); + tmp2 = __riscv_vwmacc_vx_i32m2(tmp2, jsimd_idct_4x4_consts[8], row3, vl); + tmp2 = __riscv_vwmacc_vx_i32m2(tmp2, jsimd_idct_4x4_consts[9], row1, vl); + + /* Final output stage */ + vint32m4_t tmp10_tmp12_i32m4 = __riscv_vlmul_ext_v_i32m2_i32m4(tmp10); + tmp10_tmp12_i32m4 = __riscv_vslideup_vx_i32m4(tmp10_tmp12_i32m4, __riscv_vlmul_ext_v_i32m2_i32m4(tmp12), 8, 16); + vint32m4_t tmp2_tmp0_i32m4 = __riscv_vlmul_ext_v_i32m2_i32m4(tmp2); + tmp2_tmp0_i32m4 = __riscv_vslideup_vx_i32m4(tmp2_tmp0_i32m4, __riscv_vlmul_ext_v_i32m2_i32m4(tmp0), 8, 16); + + vint32m4_t tmp_add_i32m4 = __riscv_vadd_vv_i32m4(tmp10_tmp12_i32m4, tmp2_tmp0_i32m4, 16); + vint32m4_t tmp_sub_i32m4 = __riscv_vsub_vv_i32m4(tmp10_tmp12_i32m4, tmp2_tmp0_i32m4, 16); + tmp_add_i32m4 = __riscv_vadd_vx_i32m4(tmp_add_i32m4, 1<<(CONST_BITS - PASS1_BITS + 1 - 1), 16); + tmp_sub_i32m4 = __riscv_vadd_vx_i32m4(tmp_sub_i32m4, 1<<(CONST_BITS - PASS1_BITS + 1 - 1), 16); + vint16m2_t tmp_rows_01 = __riscv_vnsra_wx_i16m2(tmp_add_i32m4, (CONST_BITS - PASS1_BITS + 1), 16); + vint16m2_t tmp_rows_32 = __riscv_vnsra_wx_i16m2(tmp_sub_i32m4, (CONST_BITS - PASS1_BITS + 1), 16); + + /* combine vectors */ + rows_0132_i16m4 = __riscv_vlmul_ext_v_i16m2_i16m4(tmp_rows_01); + rows_0132_i16m4 = __riscv_vslideup_vx_i16m4(rows_0132_i16m4, __riscv_vlmul_ext_v_i16m2_i16m4(tmp_rows_32), 16, 32); + } + + /* Transpose 8x4 block to perform IDCT on rows in second pass. */ + + /* interpret to u16 & transpose */ + vint16m4_t vg_reg16 = __riscv_vrgather(rows_0132_i16m4, __riscv_vzext_vf2_u16m4(vg_reg8, 28), 28); + + /* extract columns */ + vl = 4; + vint16m1_t col0 = __riscv_vget_v_i16m4_i16m1(vg_reg16, 0); + vint16m1_t col1 = __riscv_vslidedown_vx_i16m1(col0, 4, vl); + + vint16m4_t slidedown_m4 = __riscv_vslidedown_vx_i16m4(vg_reg16, 8, 8); + vint16m2_t tmp_i16m2 = __riscv_vlmul_trunc_v_i16m4_i16m2(slidedown_m4); + vint16m1_t col2 = __riscv_vlmul_trunc_v_i16m2_i16m1(tmp_i16m2); + tmp_i16m2 = __riscv_vslidedown_vx_i16m2(tmp_i16m2, 4, vl); + vint16m1_t col3 = __riscv_vlmul_trunc_v_i16m2_i16m1(tmp_i16m2); + + slidedown_m4 = __riscv_vslidedown_vx_i16m4(vg_reg16, 16, 8); + tmp_i16m2 = __riscv_vlmul_trunc_v_i16m4_i16m2(slidedown_m4); + vint16m1_t col5 = __riscv_vlmul_trunc_v_i16m2_i16m1(tmp_i16m2); + tmp_i16m2 = __riscv_vslidedown_vx_i16m2(tmp_i16m2, 4, vl); + vint16m1_t col6 = __riscv_vlmul_trunc_v_i16m2_i16m1(tmp_i16m2); + + slidedown_m4 = __riscv_vslidedown_vx_i16m4(vg_reg16, 24, vl); + vint16m1_t col7 = __riscv_vlmul_trunc_v_i16m4_i16m1(slidedown_m4); + + /* Commence second pass of IDCT. */ + + /* Even part */ + vint32m2_t tmp0, tmp2, tmp10, tmp12; + tmp0 = __riscv_vwmul_vx_i32m2(col0, 16384, vl); // 2^(CONST_BITS + 1) + tmp2 = __riscv_vwmul_vx_i32m2 (col2, jsimd_idct_4x4_consts[0], vl); + tmp2 = __riscv_vwmacc_vx_i32m2(tmp2, jsimd_idct_4x4_consts[1], col6, vl); + tmp10 = __riscv_vadd_vv_i32m2(tmp0, tmp2, vl); + tmp12 = __riscv_vsub_vv_i32m2(tmp0, tmp2, vl); + + /* Odd part */ + tmp0 = __riscv_vwmul_vx_i32m2 (col7, jsimd_idct_4x4_consts[2], vl); + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, jsimd_idct_4x4_consts[3], col5, vl); + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, jsimd_idct_4x4_consts[4], col3, vl); + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, jsimd_idct_4x4_consts[5], col1, vl); + + tmp2 = __riscv_vwmul_vx_i32m2 (col7, jsimd_idct_4x4_consts[6], vl); + tmp2 = __riscv_vwmacc_vx_i32m2(tmp2, jsimd_idct_4x4_consts[7], col5, vl); + tmp2 = __riscv_vwmacc_vx_i32m2(tmp2, jsimd_idct_4x4_consts[8], col3, vl); + tmp2 = __riscv_vwmacc_vx_i32m2(tmp2, jsimd_idct_4x4_consts[9], col1, vl); + + /* Final output stage: descale and clamp to range [0-255]. */ + vuint8mf2_t u8mf2_col01, u8mf2_col23; + vint8mf2_t tmp_i8mf2; + vint16m1_t tmp_i16m1; + vint32m2_t tmp_i32m2_0, tmp_i32m2_1; + tmp_i32m2_0 = __riscv_vadd_vv_i32m2(tmp10, tmp2, 4); + tmp_i32m2_1 = __riscv_vadd_vv_i32m2(tmp12, tmp0, 4); + tmp_i32m2_0 = __riscv_vslideup_vx_i32m2(tmp_i32m2_0, tmp_i32m2_1, 4, 8); + tmp_i16m1 = __riscv_vnsra_wx_i16m1(tmp_i32m2_0, 16, 8); + tmp_i8mf2 = __riscv_vnclip_wx_i8mf2(tmp_i16m1, CONST_BITS + PASS1_BITS + 3 + 1 - 16, 8); + u8mf2_col01 = __riscv_vreinterpret_v_i8mf2_u8mf2(tmp_i8mf2); + u8mf2_col01 = __riscv_vadd_vx_u8mf2(u8mf2_col01, CENTERJSAMPLE, 8); + + tmp_i32m2_0 = __riscv_vsub_vv_i32m2(tmp12, tmp0, 4); + tmp_i32m2_1 = __riscv_vsub_vv_i32m2(tmp10, tmp2, 4); + tmp_i32m2_0 = __riscv_vslideup_vx_i32m2(tmp_i32m2_0, tmp_i32m2_1, 4, 8); + tmp_i16m1 = __riscv_vnsra_wx_i16m1(tmp_i32m2_0, 16, 8); + tmp_i8mf2 = __riscv_vnclip_wx_i8mf2(tmp_i16m1, CONST_BITS + PASS1_BITS + 3 + 1 - 16, 8); + u8mf2_col23 = __riscv_vreinterpret_v_i8mf2_u8mf2(tmp_i8mf2); + u8mf2_col23 = __riscv_vadd_vx_u8mf2(u8mf2_col23, CENTERJSAMPLE, 8); + + vuint8m1_t u8m1_col0123 = __riscv_vslideup_vx_u8m1(__riscv_vlmul_ext_v_u8mf2_u8m1(u8mf2_col01), + __riscv_vlmul_ext_v_u8mf2_u8m1(u8mf2_col23), 8, 16); + + /* Transpose */ + uint8_t out_index_tab[16] = + { + 0, 4, 8, 12, // output transpose index order + 1, 5, 9, 13, + 2, 6, 10, 14, + 3, 7, 11, 15, + }; + vuint8m1_t index_order_u8m1 = __riscv_vle8_v_u8m1(out_index_tab, 16); + vuint8m1_t v0_out, v1_out, v2_out, v3_out; + v0_out = __riscv_vrgather_vv_u8m1(u8m1_col0123, index_order_u8m1, 16); + v1_out = __riscv_vslidedown_vx_u8m1(v0_out, 4, 4); + v2_out = __riscv_vslidedown_vx_u8m1(v0_out, 8, 4); + v3_out = __riscv_vslidedown_vx_u8m1(v0_out, 12, 4); + + /* Store 4x4 block to memory. */ + JSAMPROW outptr0 = output_buf[0] + output_col; + JSAMPROW outptr1 = output_buf[1] + output_col; + JSAMPROW outptr2 = output_buf[2] + output_col; + JSAMPROW outptr3 = output_buf[3] + output_col; + __riscv_vse8_v_u8m1(outptr0, v0_out, 4); + __riscv_vse8_v_u8m1(outptr1, v1_out, 4); + __riscv_vse8_v_u8m1(outptr2, v2_out, 4); + __riscv_vse8_v_u8m1(outptr3, v3_out, 4); +}