diff --git a/simd/rvv_andes/jidctfst-rvv.c b/simd/rvv_andes/jidctfst-rvv.c
new file mode 100644
index 000000000..3ee5043f6
--- /dev/null
+++ b/simd/rvv_andes/jidctfst-rvv.c
@@ -0,0 +1,377 @@
+/*
+* jidctfst-rvv.c - fast integer IDCT (RISC-V RVV)
+*
+* Copyright (c) 2012-2024 Andes Technology Corporation
+* All rights reserved.
+*/
+/*
+ * jidctfst-neon.c - fast integer IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <riscv_vector.h>
+
+
+/* jsimd_idct_ifast_rvv() performs dequantization and a fast, not so accurate
+ * inverse DCT (Discrete Cosine Transform) on one block of coefficients. It
+ * uses the same calculations and produces exactly the same output as IJG's
+ * original jpeg_idct_ifast() function, which can be found in jidctfst.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.082392200 =  2688 * 2^-15
+ *    0.414213562 = 13568 * 2^-15
+ *    0.847759065 = 27776 * 2^-15
+ *    0.613125930 = 20096 * 2^-15
+ *
+ * See jidctfst.c for further details of the IDCT algorithm. Where possible,
+ * the variable names and comments here in jsimd_idct_ifast_rvv() match up
+ * with those in jpeg_idct_ifast().
+ */
+
+#define PASS1_BITS  2
+
+#define F_0_082  2688
+#define F_0_414  13568
+#define F_0_847  27776
+#define F_0_613  20096
+
+
+static const int16_t idct_ifast_consts[] = {
+  F_0_082, F_0_414, F_0_847, F_0_613
+};
+
+void jsimd_idct_ifast_rvv(void *dct_table, JCOEFPTR coef_block,
+                          JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  IFAST_MULT_TYPE *quantptr = dct_table;
+  vint16m8_t rows_all_i16m8;
+
+  /* Load DCT coefficients. */
+  size_t vl = 8;
+  vint16m1_t row0 = __riscv_vle16_v_i16m1(coef_block + 0 * DCTSIZE, vl);
+  vint16m1_t row1 = __riscv_vle16_v_i16m1(coef_block + 1 * DCTSIZE, vl);
+  vint16m1_t row2 = __riscv_vle16_v_i16m1(coef_block + 2 * DCTSIZE, vl);
+  vint16m1_t row3 = __riscv_vle16_v_i16m1(coef_block + 3 * DCTSIZE, vl);
+  vint16m1_t row4 = __riscv_vle16_v_i16m1(coef_block + 4 * DCTSIZE, vl);
+  vint16m1_t row5 = __riscv_vle16_v_i16m1(coef_block + 5 * DCTSIZE, vl);
+  vint16m1_t row6 = __riscv_vle16_v_i16m1(coef_block + 6 * DCTSIZE, vl);
+  vint16m1_t row7 = __riscv_vle16_v_i16m1(coef_block + 7 * DCTSIZE, vl);
+
+  /* Load quantization table values for DC coefficients. */
+  vint16m1_t quant_row0 = __riscv_vle16_v_i16m1(quantptr + 0 * DCTSIZE, vl);
+
+  /* Dequantize DC coefficients.
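+   * In scalar terms (see jidctfst.c), dequantization is just an element-wise
+   * multiply by the quantization table, i.e. roughly
+   *   row0[i] = coef_block[i] * quantptr[i],  i = 0..7
+   * and the same one-multiply pattern is used for the AC rows further down.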
*/ + row0 = __riscv_vmul_vv_i16m1(row0, quant_row0, vl); + + /* Construct bitmap to test if all AC coefficients are 0. */ + vint16m1_t bitmap = __riscv_vor_vv_i16m1(row1, row2, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row3, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row4, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row5, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row6, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row7, vl); + + uint16_t ac_bitmap; + { + vuint16m1_t vec_zero = __riscv_vmv_s_x_u16m1(0, vl); + vuint16m1_t tmp_u16m1 = __riscv_vreinterpret_v_i16m1_u16m1(bitmap); + ac_bitmap = __riscv_vmv_x_s_u16m1_u16(__riscv_vredor_vs_u16m1_u16m1(tmp_u16m1, vec_zero, vl)); + } + + /* Load IDCT conversion constants. */ + if (0 == ac_bitmap) + { + /* All AC coefficients are zero. + * Compute DC values and duplicate into vectors. + */ + // combine vectors + vint16m4_t tmp_i16m4 = __riscv_vlmul_ext_v_i16m1_i16m4(row0); + tmp_i16m4 = __riscv_vslideup_vx_i16m4(tmp_i16m4, tmp_i16m4, DCTSIZE2/8, DCTSIZE2/4); + tmp_i16m4 = __riscv_vslideup_vx_i16m4(tmp_i16m4, tmp_i16m4, DCTSIZE2/4, DCTSIZE2/2); + rows_all_i16m8 = __riscv_vlmul_ext_v_i16m4_i16m8(tmp_i16m4); + rows_all_i16m8 = __riscv_vslideup_vx_i16m8(rows_all_i16m8, rows_all_i16m8, DCTSIZE2/2, DCTSIZE2); + } + else { + /* full IDCT calculation. */ + + /* Load quantization table. */ + vint16m1_t quant_row1 = __riscv_vle16_v_i16m1(quantptr + 1 * DCTSIZE, vl); + vint16m1_t quant_row2 = __riscv_vle16_v_i16m1(quantptr + 2 * DCTSIZE, vl); + vint16m1_t quant_row3 = __riscv_vle16_v_i16m1(quantptr + 3 * DCTSIZE, vl); + vint16m1_t quant_row4 = __riscv_vle16_v_i16m1(quantptr + 4 * DCTSIZE, vl); + vint16m1_t quant_row5 = __riscv_vle16_v_i16m1(quantptr + 5 * DCTSIZE, vl); + vint16m1_t quant_row6 = __riscv_vle16_v_i16m1(quantptr + 6 * DCTSIZE, vl); + vint16m1_t quant_row7 = __riscv_vle16_v_i16m1(quantptr + 7 * DCTSIZE, vl); + + /* Even part: dequantize DCT coefficients. */ + vint16m1_t tmp0 = __riscv_vmv_v_v_i16m1(row0, vl); + vint16m1_t tmp1 = __riscv_vmul_vv_i16m1(row2, quant_row2, vl); + vint16m1_t tmp2 = __riscv_vmul_vv_i16m1(row4, quant_row4, vl); + vint16m1_t tmp3 = __riscv_vmul_vv_i16m1(row6, quant_row6, vl); + + vint16m1_t tmp10 = __riscv_vadd_vv_i16m1(tmp0, tmp2, vl); /* phase 3 */ + vint16m1_t tmp11 = __riscv_vsub_vv_i16m1(tmp0, tmp2, vl); + vint16m1_t tmp13 = __riscv_vadd_vv_i16m1(tmp1, tmp3, vl); /* phases 5-3 */ + vint16m1_t tmp12; + { + vint16m1_t tmp1_sub_tmp3 = __riscv_vsub_vv_i16m1(tmp1, tmp3, vl); + tmp12 = __riscv_vsmul_vx_i16m1(tmp1_sub_tmp3, idct_ifast_consts[1], vl); + tmp12 = __riscv_vadd_vv_i16m1(tmp12, tmp1_sub_tmp3, vl); + tmp12 = __riscv_vsub_vv_i16m1(tmp12, tmp13, vl); + } + + tmp0 = __riscv_vadd_vv_i16m1(tmp10, tmp13, vl); /* phase 2 */ + tmp3 = __riscv_vsub_vv_i16m1(tmp10, tmp13, vl); + tmp1 = __riscv_vadd_vv_i16m1(tmp11, tmp12, vl); + tmp2 = __riscv_vsub_vv_i16m1(tmp11, tmp12, vl); + + /* Odd part: dequantize DCT coefficients. 
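+   * A note on the fixed-point multiplies used in both the even and odd
+   * parts: __riscv_vsmul_vx_i16m1(x, c, vl) returns the Q15 product
+   * (x * c) >> 15 (rounded per vxrm), so a multiplication by 1.414213562 is
+   * expressed as
+   *   x + __riscv_vsmul_vx_i16m1(x, F_0_414, vl)
+   * since F_0_414 = 13568 ~= 0.414213562 * 2^15, and similarly for the
+   * 1.847759065, 1.082392200 and 2.613125930 factors below.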
 */
+    vint16m1_t tmp4 = __riscv_vmul_vv_i16m1(__riscv_vmv_v_v_i16m1(row1, vl), quant_row1, vl);
+    vint16m1_t tmp5 = __riscv_vmul_vv_i16m1(__riscv_vmv_v_v_i16m1(row3, vl), quant_row3, vl);
+    vint16m1_t tmp6 = __riscv_vmul_vv_i16m1(__riscv_vmv_v_v_i16m1(row5, vl), quant_row5, vl);
+    vint16m1_t tmp7 = __riscv_vmul_vv_i16m1(__riscv_vmv_v_v_i16m1(row7, vl), quant_row7, vl);
+
+    vint16m1_t z13 = __riscv_vadd_vv_i16m1(tmp5, tmp6, vl);    /* phase 6 */
+    vint16m1_t neg_z10 = __riscv_vsub_vv_i16m1(tmp5, tmp6, vl);
+    vint16m1_t z11 = __riscv_vadd_vv_i16m1(tmp4, tmp7, vl);
+    vint16m1_t z12 = __riscv_vsub_vv_i16m1(tmp4, tmp7, vl);
+
+    tmp7 = __riscv_vadd_vv_i16m1(z11, z13, vl);                /* phase 5 */
+    {
+      vint16m1_t z11_sub_z13 = __riscv_vsub_vv_i16m1(z11, z13, vl);
+      tmp11 = __riscv_vsmul_vx_i16m1(z11_sub_z13, idct_ifast_consts[1], vl);
+      tmp11 = __riscv_vadd_vv_i16m1(tmp11, z11_sub_z13, vl);
+    }
+
+    {
+      vint16m1_t z10_add_z12 = __riscv_vsub_vv_i16m1(z12, neg_z10, vl);
+      vint16m1_t z5 = __riscv_vsmul_vx_i16m1(z10_add_z12, idct_ifast_consts[2], vl);
+      z5 = __riscv_vadd_vv_i16m1(z5, z10_add_z12, vl);
+
+      tmp10 = __riscv_vsmul_vx_i16m1(z12, idct_ifast_consts[0], vl);
+      tmp10 = __riscv_vadd_vv_i16m1(tmp10, z12, vl);
+      tmp10 = __riscv_vsub_vv_i16m1(tmp10, z5, vl);
+
+      tmp12 = __riscv_vsmul_vx_i16m1(neg_z10, idct_ifast_consts[3], vl);
+      vint16m1_t tmp_neg_z10_d = __riscv_vadd_vv_i16m1(neg_z10, neg_z10, vl);
+      tmp12 = __riscv_vadd_vv_i16m1(tmp12, tmp_neg_z10_d, vl);
+      tmp12 = __riscv_vadd_vv_i16m1(tmp12, z5, vl);
+    }
+
+    tmp6 = __riscv_vsub_vv_i16m1(tmp12, tmp7, vl);             /* phase 2 */
+    tmp5 = __riscv_vsub_vv_i16m1(tmp11, tmp6, vl);
+    tmp4 = __riscv_vadd_vv_i16m1(tmp10, tmp5, vl);
+
+    row0 = __riscv_vadd_vv_i16m1(tmp0, tmp7, vl);
+    row7 = __riscv_vsub_vv_i16m1(tmp0, tmp7, vl);
+    row1 = __riscv_vadd_vv_i16m1(tmp1, tmp6, vl);
+    row6 = __riscv_vsub_vv_i16m1(tmp1, tmp6, vl);
+    row2 = __riscv_vadd_vv_i16m1(tmp2, tmp5, vl);
+    row5 = __riscv_vsub_vv_i16m1(tmp2, tmp5, vl);
+    row4 = __riscv_vadd_vv_i16m1(tmp3, tmp4, vl);
+    row3 = __riscv_vsub_vv_i16m1(tmp3, tmp4, vl);
+
+    // combine vectors
+    vint16m4_t rows_0123_i16m4 = __riscv_vlmul_ext_v_i16m1_i16m4(row0);
+    rows_0123_i16m4 = __riscv_vslideup_vx_i16m4(rows_0123_i16m4, __riscv_vlmul_ext_v_i16m1_i16m4(row1), 8, 32);
+    rows_0123_i16m4 = __riscv_vslideup_vx_i16m4(rows_0123_i16m4, __riscv_vlmul_ext_v_i16m1_i16m4(row2), 16, 32);
+    rows_0123_i16m4 = __riscv_vslideup_vx_i16m4(rows_0123_i16m4, __riscv_vlmul_ext_v_i16m1_i16m4(row3), 24, 32);
+
+    vint16m4_t rows_4567_i16m4 = __riscv_vlmul_ext_v_i16m1_i16m4(row4);
+    rows_4567_i16m4 = __riscv_vslideup_vx_i16m4(rows_4567_i16m4, __riscv_vlmul_ext_v_i16m1_i16m4(row5), 8, 32);
+    rows_4567_i16m4 = __riscv_vslideup_vx_i16m4(rows_4567_i16m4, __riscv_vlmul_ext_v_i16m1_i16m4(row6), 16, 32);
+    rows_4567_i16m4 = __riscv_vslideup_vx_i16m4(rows_4567_i16m4, __riscv_vlmul_ext_v_i16m1_i16m4(row7), 24, 32);
+
+    rows_all_i16m8 = __riscv_vlmul_ext_v_i16m4_i16m8(rows_0123_i16m4);
+    rows_all_i16m8 = __riscv_vslideup_vx_i16m8(rows_all_i16m8, __riscv_vlmul_ext_v_i16m4_i16m8(rows_4567_i16m4), 32, 64);
+  }
+
+  /* Transpose rows to work on columns in pass 2.
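+   * The 8x8 transpose below is a single index gather: vrgather picks source
+   * element trans_index8x8_u8[i] for destination element i, which is the
+   * usual scalar loop
+   *   out[r * 8 + c] = in[c * 8 + r]
+   * expressed as one table lookup over the 64 packed elements.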
*/ + const uint8_t trans_index8x8_u8[DCTSIZE2] = + { + 0, 8, 16, 24, 32, 40, 48, 56, + 1, 9, 17, 25, 33, 41, 49, 57, + 2, 10, 18, 26, 34, 42, 50, 58, + 3, 11, 19, 27, 35, 43, 51, 59, + 4, 12, 20, 28, 36, 44, 52, 60, + 5, 13, 21, 29, 37, 45, 53, 61, + 6, 14, 22, 30, 38, 46, 54, 62, + 7, 15, 23, 31, 39, 47, 55, 63, + }; + + // load transpose look-up table + vuint8m4_t vg_reg8 = __riscv_vle8_v_u8m4(trans_index8x8_u8, DCTSIZE2); + + // interpret to u16 & transpose + vint16m8_t vg_reg16 = __riscv_vrgather(rows_all_i16m8, __riscv_vzext_vf2_u16m8(vg_reg8, DCTSIZE2), \ + DCTSIZE2); + int16_t workspace[DCTSIZE2]; /* buffers data between passes */ + __riscv_vse16_v_i16m8(&workspace[0], vg_reg16, DCTSIZE2); + vint16m1_t col0 = __riscv_vle16_v_i16m1(&workspace[8*0], vl); + vint16m1_t col1 = __riscv_vle16_v_i16m1(&workspace[8*1], vl); + vint16m1_t col2 = __riscv_vle16_v_i16m1(&workspace[8*2], vl); + vint16m1_t col3 = __riscv_vle16_v_i16m1(&workspace[8*3], vl); + vint16m1_t col4 = __riscv_vle16_v_i16m1(&workspace[8*4], vl); + vint16m1_t col5 = __riscv_vle16_v_i16m1(&workspace[8*5], vl); + vint16m1_t col6 = __riscv_vle16_v_i16m1(&workspace[8*6], vl); + vint16m1_t col7 = __riscv_vle16_v_i16m1(&workspace[8*7], vl); + + /* 1-D IDCT, pass 2 */ + + /* Even part */ + vint16m1_t tmp10 = __riscv_vadd_vv_i16m1(col0, col4, vl); + vint16m1_t tmp11 = __riscv_vsub_vv_i16m1(col0, col4, vl); + vint16m1_t tmp13 = __riscv_vadd_vv_i16m1(col2, col6, vl); + vint16m1_t tmp12; + { + vint16m1_t col2_sub_col6 = __riscv_vsub_vv_i16m1(col2, col6, vl); + tmp12 = __riscv_vsmul_vx_i16m1(col2_sub_col6, idct_ifast_consts[1], vl); + tmp12 = __riscv_vadd_vv_i16m1(tmp12, col2_sub_col6, vl); + tmp12 = __riscv_vsub_vv_i16m1(tmp12, tmp13, vl); + } + + vint16m1_t tmp0 = __riscv_vadd_vv_i16m1(tmp10, tmp13, vl); + vint16m1_t tmp3 = __riscv_vsub_vv_i16m1(tmp10, tmp13, vl); + vint16m1_t tmp1 = __riscv_vadd_vv_i16m1(tmp11, tmp12, vl); + vint16m1_t tmp2 = __riscv_vsub_vv_i16m1(tmp11, tmp12, vl); + + /* Odd part */ + vint16m1_t z13 = __riscv_vadd_vv_i16m1(col5, col3, vl); + vint16m1_t neg_z10 = __riscv_vsub_vv_i16m1(col3, col5, vl); + vint16m1_t z11 = __riscv_vadd_vv_i16m1(col1, col7, vl); + vint16m1_t z12 = __riscv_vsub_vv_i16m1(col1, col7, vl); + + vint16m1_t tmp7 = __riscv_vadd_vv_i16m1(z11, z13, vl); /* phase 5 */ + { + vint16m1_t z11_sub_z13 = __riscv_vsub_vv_i16m1(z11, z13, vl); + tmp11 = __riscv_vsmul_vx_i16m1(z11_sub_z13, idct_ifast_consts[1], vl); + tmp11 = __riscv_vadd_vv_i16m1(tmp11, z11_sub_z13, vl); + } + + { + vint16m1_t z10_add_z12 = __riscv_vsub_vv_i16m1(z12, neg_z10, vl); + vint16m1_t z5 = __riscv_vsmul_vx_i16m1(z10_add_z12, idct_ifast_consts[2], vl); + z5 = __riscv_vadd_vv_i16m1(z5, z10_add_z12, vl); + + tmp10 = __riscv_vsmul_vx_i16m1(z12, idct_ifast_consts[0], vl); + tmp10 = __riscv_vadd_vv_i16m1(tmp10, z12, vl); + tmp10 = __riscv_vsub_vv_i16m1(tmp10, z5, vl); + + tmp12 = __riscv_vsmul_vx_i16m1(neg_z10, idct_ifast_consts[3], vl); + vint16m1_t tmp_neg_z10_d = __riscv_vadd_vv_i16m1(neg_z10, neg_z10, vl); + tmp12 = __riscv_vadd_vv_i16m1(tmp12, tmp_neg_z10_d, vl); + tmp12 = __riscv_vadd_vv_i16m1(tmp12, z5, vl); + } + + vint16m1_t tmp6 = __riscv_vsub_vv_i16m1(tmp12, tmp7, vl); /* phase 2 */ + vint16m1_t tmp5 = __riscv_vsub_vv_i16m1(tmp11, tmp6, vl); + vint16m1_t tmp4 = __riscv_vadd_vv_i16m1(tmp10, tmp5, vl); + + col0 = __riscv_vadd_vv_i16m1(tmp0, tmp7, vl); + col7 = __riscv_vsub_vv_i16m1(tmp0, tmp7, vl); + col1 = __riscv_vadd_vv_i16m1(tmp1, tmp6, vl); + col6 = __riscv_vsub_vv_i16m1(tmp1, tmp6, vl); + col2 = __riscv_vadd_vv_i16m1(tmp2, 
tmp5, vl); + col5 = __riscv_vsub_vv_i16m1(tmp2, tmp5, vl); + col4 = __riscv_vadd_vv_i16m1(tmp3, tmp4, vl); + col3 = __riscv_vsub_vv_i16m1(tmp3, tmp4, vl); + + /* Scale down by a factor of 8, narrowing to 8-bit. */ + /* Clamp to range [0-255]. */ + vint8m2_t tmp_i8m2; + vuint8m2_t u8m2_col0123, u8m2_col4567; + vint16m4_t i16m4_col0123 = __riscv_vlmul_ext_v_i16m1_i16m4(col0); + i16m4_col0123 = __riscv_vslideup_vx_i16m4(i16m4_col0123, __riscv_vlmul_ext_v_i16m1_i16m4(col1), 8, 16); + i16m4_col0123 = __riscv_vslideup_vx_i16m4(i16m4_col0123, __riscv_vlmul_ext_v_i16m1_i16m4(col2), 16, 24); + i16m4_col0123 = __riscv_vslideup_vx_i16m4(i16m4_col0123, __riscv_vlmul_ext_v_i16m1_i16m4(col3), 24, 32); + tmp_i8m2 = __riscv_vnclip_wx_i8m2(i16m4_col0123, PASS1_BITS + 3, 32); // clamp -128 ~ 127 + u8m2_col0123 = __riscv_vreinterpret_v_i8m2_u8m2(tmp_i8m2); + u8m2_col0123 = __riscv_vadd_vx_u8m2(u8m2_col0123, CENTERJSAMPLE, 32); // clamp 0 ~ 255 + + vint16m4_t i16m4_col4567 = __riscv_vlmul_ext_v_i16m1_i16m4(col4); + i16m4_col4567 = __riscv_vslideup_vx_i16m4(i16m4_col4567, __riscv_vlmul_ext_v_i16m1_i16m4(col5), 8, 16); + i16m4_col4567 = __riscv_vslideup_vx_i16m4(i16m4_col4567, __riscv_vlmul_ext_v_i16m1_i16m4(col6), 16, 24); + i16m4_col4567 = __riscv_vslideup_vx_i16m4(i16m4_col4567, __riscv_vlmul_ext_v_i16m1_i16m4(col7), 24, 32); + tmp_i8m2 = __riscv_vnclip_wx_i8m2(i16m4_col4567, PASS1_BITS + 3, 32); // clamp -128 ~ 127 + u8m2_col4567 = __riscv_vreinterpret_v_i8m2_u8m2(tmp_i8m2); + u8m2_col4567 = __riscv_vadd_vx_u8m2(u8m2_col4567, CENTERJSAMPLE, 32); // clamp 0 ~ 255 + + vuint8m4_t u8m4_col_all = __riscv_vlmul_ext_v_u8m2_u8m4(u8m2_col0123); + u8m4_col_all = __riscv_vslideup_vx_u8m4(u8m4_col_all, __riscv_vlmul_ext_v_u8m2_u8m4(u8m2_col4567), 32, 64); + + /* Transpose block to prepare for store. 
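+   * The narrowing just above maps pass-2 results into sample range: vnclip
+   * shifts right by PASS1_BITS + 3 = 5 and saturates to [-128, 127], and the
+   * reinterpret plus add of CENTERJSAMPLE (128) then re-centers the result
+   * as unsigned samples in [0, 255], much like IDESCALE() followed by
+   * range_limit[] in jidctfst.c.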
*/ + vuint8m4_t u8m4_trans_all = __riscv_vrgather_vv_u8m4(u8m4_col_all, vg_reg8, DCTSIZE2); + + // extract columns + vuint8mf2_t u8mf2_col_0, u8mf2_col_1, u8mf2_col_2, u8mf2_col_3; + vuint8mf2_t u8mf2_col_4, u8mf2_col_5, u8mf2_col_6, u8mf2_col_7; + + vuint8m4_t slidedown_m4 = __riscv_vslidedown_vx_u8m4(u8m4_trans_all, 0, vl); + u8mf2_col_0 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(u8m4_trans_all, 8, vl); + u8mf2_col_1 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(u8m4_trans_all, 16, vl); + u8mf2_col_2 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(u8m4_trans_all, 24, vl); + u8mf2_col_3 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(u8m4_trans_all, 32, vl); + u8mf2_col_4 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(u8m4_trans_all, 40, vl); + u8mf2_col_5 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(u8m4_trans_all, 48, vl); + u8mf2_col_6 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(u8m4_trans_all, 56, vl); + u8mf2_col_7 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + JSAMPROW outptr0 = output_buf[0] + output_col; + JSAMPROW outptr1 = output_buf[1] + output_col; + JSAMPROW outptr2 = output_buf[2] + output_col; + JSAMPROW outptr3 = output_buf[3] + output_col; + JSAMPROW outptr4 = output_buf[4] + output_col; + JSAMPROW outptr5 = output_buf[5] + output_col; + JSAMPROW outptr6 = output_buf[6] + output_col; + JSAMPROW outptr7 = output_buf[7] + output_col; + + /* Store DCT block to memory. */ + __riscv_vse8_v_u8mf2(outptr0, u8mf2_col_0, vl); + __riscv_vse8_v_u8mf2(outptr1, u8mf2_col_1, vl); + __riscv_vse8_v_u8mf2(outptr2, u8mf2_col_2, vl); + __riscv_vse8_v_u8mf2(outptr3, u8mf2_col_3, vl); + __riscv_vse8_v_u8mf2(outptr4, u8mf2_col_4, vl); + __riscv_vse8_v_u8mf2(outptr5, u8mf2_col_5, vl); + __riscv_vse8_v_u8mf2(outptr6, u8mf2_col_6, vl); + __riscv_vse8_v_u8mf2(outptr7, u8mf2_col_7, vl); +} + diff --git a/simd/rvv_andes/jidctint-rvv.c b/simd/rvv_andes/jidctint-rvv.c new file mode 100644 index 000000000..737ff2797 --- /dev/null +++ b/simd/rvv_andes/jidctint-rvv.c @@ -0,0 +1,489 @@ +/* +* jidctint-rvv.c - accurate integer IDCT (RISC-V RVV) +* +* Copyright (c) 2012-2024 Andes Technology Corporation +* All rights reserved. +*/ +/* + * jidctint-neon.c - accurate integer IDCT (Arm Neon) + * + * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * Copyright (C) 2020, D. R. Commander. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. 
This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <riscv_vector.h>
+
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+
+/* The computation of the inverse DCT requires the use of constants known at
+ * compile time. Scaled integer constants are used to avoid floating-point
+ * arithmetic:
+ *    0.298631336 =  2446 * 2^-13
+ *    0.390180644 =  3196 * 2^-13
+ *    0.541196100 =  4433 * 2^-13
+ *    0.765366865 =  6270 * 2^-13
+ *    0.899976223 =  7373 * 2^-13
+ *    1.175875602 =  9633 * 2^-13
+ *    1.501321110 = 12299 * 2^-13
+ *    1.847759065 = 15137 * 2^-13
+ *    1.961570560 = 16069 * 2^-13
+ *    2.053119869 = 16819 * 2^-13
+ *    2.562915447 = 20995 * 2^-13
+ *    3.072711026 = 25172 * 2^-13
+ */
+
+#define F_0_298  2446
+#define F_0_390  3196
+#define F_0_541  4433
+#define F_0_765  6270
+#define F_0_899  7373
+#define F_1_175  9633
+#define F_1_501  12299
+#define F_1_847  15137
+#define F_1_961  16069
+#define F_2_053  16819
+#define F_2_562  20995
+#define F_3_072  25172
+
+#define F_1_175_MINUS_1_961  (F_1_175 - F_1_961)
+#define F_1_175_MINUS_0_390  (F_1_175 - F_0_390)
+#define F_0_541_MINUS_1_847  (F_0_541 - F_1_847)
+#define F_3_072_MINUS_2_562  (F_3_072 - F_2_562)
+#define F_0_298_MINUS_0_899  (F_0_298 - F_0_899)
+#define F_1_501_MINUS_0_899  (F_1_501 - F_0_899)
+#define F_2_053_MINUS_2_562  (F_2_053 - F_2_562)
+#define F_0_541_PLUS_0_765   (F_0_541 + F_0_765)
+
+
+static const int16_t idct_islow_consts[] = {
+  F_0_899,             F_0_541,
+  F_2_562,             F_0_298_MINUS_0_899,
+  F_1_501_MINUS_0_899, F_2_053_MINUS_2_562,
+  F_0_541_PLUS_0_765,  F_1_175,
+  F_1_175_MINUS_0_390, F_0_541_MINUS_1_847,
+  F_3_072_MINUS_2_562, F_1_175_MINUS_1_961,
+  0, 0, 0, 0
+};
+
+#define TRANS_TABLE_U8_SIZE 64
+const uint8_t trans_index8x8_u8[TRANS_TABLE_U8_SIZE] =
+{
+/* #0  #1  #2  #3  #7  #6  #5  #4 */
+    0,  8, 16, 24, 56, 48, 40, 32,
+    1,  9, 17, 25, 57, 49, 41, 33,
+    2, 10, 18, 26, 58, 50, 42, 34,
+    3, 11, 19, 27, 59, 51, 43, 35,
+    4, 12, 20, 28, 60, 52, 44, 36,
+    5, 13, 21, 29, 61, 53, 45, 37,
+    6, 14, 22, 30, 62, 54, 46, 38,
+    7, 15, 23, 31, 63, 55, 47, 39,
+};
+
+
+/* Forward declaration of regular IDCT helper functions */
+
+static INLINE void jsimd_idct_islow_pass1_regular(vint16m1_t row0,
+                                                  vint16m1_t row1,
+                                                  vint16m1_t row2,
+                                                  vint16m1_t row3,
+                                                  vint16m1_t row4,
+                                                  vint16m1_t row5,
+                                                  vint16m1_t row6,
+                                                  vint16m1_t row7,
+                                                  vint16m1_t quant_row0,
+                                                  vint16m1_t quant_row1,
+                                                  vint16m1_t quant_row2,
+                                                  vint16m1_t quant_row3,
+                                                  vint16m1_t quant_row4,
+                                                  vint16m1_t quant_row5,
+                                                  vint16m1_t quant_row6,
+                                                  vint16m1_t quant_row7,
+                                                  vint16m8_t *cols_all_i16m8);
+
+static INLINE void jsimd_idct_islow_pass2_regular(vint16m8_t *cols_all_i16m8,
+                                                  JSAMPARRAY output_buf,
+                                                  JDIMENSION output_col);
+
+
+/* Perform dequantization and inverse DCT on one block of coefficients. For
+ * reference, the C implementation (jpeg_idct_slow()) can be found in
+ * jidctint.c.
+ */
+
+void jsimd_idct_islow_rvv(void *dct_table, JCOEFPTR coef_block,
+                          JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  ISLOW_MULT_TYPE *quantptr = dct_table;
+  vint16m8_t cols_all_i16m8;
+
+  /* Compute IDCT first pass on the 8x8 coefficient block. */
+
+  /* Load DCT coefficients.
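+   * Unlike the ifast kernel, this kernel keeps 32-bit intermediates: every
+   * constant is pre-scaled by 2^13 (CONST_BITS), so a term such as
+   * 0.541196100 * z2 becomes the widening multiply
+   *   __riscv_vwmul_vx_i32m2(z2, F_0_541, vl)    (4433 ~= 0.541196100 * 2^13)
+   * and results are descaled later by DESCALE_P1 (pass 1) and DESCALE_P2
+   * (pass 2), the counterparts of DESCALE() in jidctint.c.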
*/ + size_t vl = 8; + vint16m1_t row0 = __riscv_vle16_v_i16m1(coef_block + 0 * DCTSIZE, vl); + vint16m1_t row1 = __riscv_vle16_v_i16m1(coef_block + 1 * DCTSIZE, vl); + vint16m1_t row2 = __riscv_vle16_v_i16m1(coef_block + 2 * DCTSIZE, vl); + vint16m1_t row3 = __riscv_vle16_v_i16m1(coef_block + 3 * DCTSIZE, vl); + vint16m1_t row4 = __riscv_vle16_v_i16m1(coef_block + 4 * DCTSIZE, vl); + vint16m1_t row5 = __riscv_vle16_v_i16m1(coef_block + 5 * DCTSIZE, vl); + vint16m1_t row6 = __riscv_vle16_v_i16m1(coef_block + 6 * DCTSIZE, vl); + vint16m1_t row7 = __riscv_vle16_v_i16m1(coef_block + 7 * DCTSIZE, vl); + + /* Load quantization table. */ + vint16m1_t quant_row0 = __riscv_vle16_v_i16m1(quantptr + 0 * DCTSIZE, vl); + vint16m1_t quant_row1 = __riscv_vle16_v_i16m1(quantptr + 1 * DCTSIZE, vl); + vint16m1_t quant_row2 = __riscv_vle16_v_i16m1(quantptr + 2 * DCTSIZE, vl); + vint16m1_t quant_row3 = __riscv_vle16_v_i16m1(quantptr + 3 * DCTSIZE, vl); + vint16m1_t quant_row4 = __riscv_vle16_v_i16m1(quantptr + 4 * DCTSIZE, vl); + vint16m1_t quant_row5 = __riscv_vle16_v_i16m1(quantptr + 5 * DCTSIZE, vl); + vint16m1_t quant_row6 = __riscv_vle16_v_i16m1(quantptr + 6 * DCTSIZE, vl); + vint16m1_t quant_row7 = __riscv_vle16_v_i16m1(quantptr + 7 * DCTSIZE, vl); + + /* Construct bitmap to test if DCT coefficients are 0. */ + vuint16m1_t vec_zero = __riscv_vmv_s_x_u16m1(0, vl); + vint16m1_t bitmap = __riscv_vor_vv_i16m1(row7, row6, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row5, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row4, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row3, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row2, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row1, vl); + vuint16m1_t tmp_u16m1 = __riscv_vredor_vs_u16m1_u16m1(__riscv_vreinterpret_v_i16m1_u16m1(bitmap), vec_zero, vl); + uint16_t ac_bitmap = __riscv_vmv_x_s_u16m1_u16(tmp_u16m1); + if (0 == ac_bitmap) { + vint16m1_t dcval_i16m1 = __riscv_vmul_vv_i16m1(row0, quant_row0, vl); + dcval_i16m1 = __riscv_vsll_vx_i16m1(dcval_i16m1, PASS1_BITS, vl); + + // combine vectors + vint16m4_t tmp_i16m4 = __riscv_vlmul_ext_v_i16m1_i16m4(dcval_i16m1); + tmp_i16m4 = __riscv_vslideup_vx_i16m4(tmp_i16m4, tmp_i16m4, DCTSIZE2/8, DCTSIZE2/4); + tmp_i16m4 = __riscv_vslideup_vx_i16m4(tmp_i16m4, tmp_i16m4, DCTSIZE2/4, DCTSIZE2/2); + cols_all_i16m8 = __riscv_vlmul_ext_v_i16m4_i16m8(tmp_i16m4); + cols_all_i16m8 = __riscv_vslideup_vx_i16m8(cols_all_i16m8, cols_all_i16m8, DCTSIZE2/2, DCTSIZE2); + } else { + jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5, + row6, row7, quant_row0, quant_row1, + quant_row2, quant_row3, quant_row4, + quant_row5, quant_row6, quant_row7, + &cols_all_i16m8); + } + + /* Second pass: compute IDCT on rows in workspace. */ + jsimd_idct_islow_pass2_regular(&cols_all_i16m8, output_buf, output_col); +} + + +/* +* This "regular" version assumes that no optimization can be made to the IDCT +* calculation, since no useful set of AC coefficients is all 0. +* +* The original C implementation of the accurate IDCT (jpeg_idct_slow()) can be +* found in jidctint.c. Algorithmic changes made here are documented inline. 
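+*
+* (The caller above also mirrors the scalar sparse-block shortcut: when every
+* AC coefficient is zero, pass 1 degenerates to
+*   dcval = DEQUANTIZE(coef_block[0], quantptr[0]) << PASS1_BITS
+* replicated down each column, so jsimd_idct_islow_rvv() simply broadcasts
+* the dequantized, left-shifted DC row in that case.)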
+*/ + +static INLINE void jsimd_idct_islow_pass1_regular(vint16m1_t row0, + vint16m1_t row1, + vint16m1_t row2, + vint16m1_t row3, + vint16m1_t row4, + vint16m1_t row5, + vint16m1_t row6, + vint16m1_t row7, + vint16m1_t quant_row0, + vint16m1_t quant_row1, + vint16m1_t quant_row2, + vint16m1_t quant_row3, + vint16m1_t quant_row4, + vint16m1_t quant_row5, + vint16m1_t quant_row6, + vint16m1_t quant_row7, + vint16m8_t *cols_all_i16m8) +{ + /* Even part */ + size_t vl = 8; + vint16m1_t z2_s16 = __riscv_vmul_vv_i16m1(row2, quant_row2, vl); + vint16m1_t z3_s16 = __riscv_vmul_vv_i16m1(row6, quant_row6, vl); + vint32m2_t tmp2 = __riscv_vwmul_vx_i32m2(z2_s16, idct_islow_consts[1], vl); + vint32m2_t tmp3 = __riscv_vwmul_vx_i32m2(z2_s16, idct_islow_consts[6], vl); + tmp2 = __riscv_vwmacc_vx_i32m2(tmp2, idct_islow_consts[9], z3_s16, vl); + tmp3 = __riscv_vwmacc_vx_i32m2(tmp3, idct_islow_consts[1], z3_s16, vl); + + z2_s16 = __riscv_vmul_vv_i16m1(row0, quant_row0, vl); + z3_s16 = __riscv_vmul_vv_i16m1(row4, quant_row4, vl); + vint32m2_t tmp0 = __riscv_vwmul_vx_i32m2(__riscv_vadd_vv_i16m1(z2_s16, z3_s16, vl), 8192, vl); // 8192 = 2^(CONST_BITS) + vint32m2_t tmp1 = __riscv_vwmul_vx_i32m2(__riscv_vsub_vv_i16m1(z2_s16, z3_s16, vl), 8192, vl); + + vint32m2_t tmp10 = __riscv_vadd_vv_i32m2(tmp0, tmp3, vl); + vint32m2_t tmp13 = __riscv_vsub_vv_i32m2(tmp0, tmp3, vl); + vint32m2_t tmp11 = __riscv_vadd_vv_i32m2(tmp1, tmp2, vl); + vint32m2_t tmp12 = __riscv_vsub_vv_i32m2(tmp1, tmp2, vl); + + /* Odd part */ + vint16m1_t tmp0_s16 = __riscv_vmul_vv_i16m1(row7, quant_row7, vl); + vint16m1_t tmp1_s16 = __riscv_vmul_vv_i16m1(row5, quant_row5, vl); + vint16m1_t tmp2_s16 = __riscv_vmul_vv_i16m1(row3, quant_row3, vl); + vint16m1_t tmp3_s16 = __riscv_vmul_vv_i16m1(row1, quant_row1, vl); + z3_s16 = __riscv_vadd_vv_i16m1(tmp0_s16, tmp2_s16, vl); + vint16m1_t z4_s16 = __riscv_vadd_vv_i16m1(tmp1_s16, tmp3_s16, vl); + + vint32m2_t z3 = __riscv_vwmul_vx_i32m2(z3_s16, idct_islow_consts[11], vl); + vint32m2_t z4 = __riscv_vwmul_vx_i32m2(z3_s16, idct_islow_consts[ 7], vl); + z3 = __riscv_vwmacc_vx_i32m2(z3, idct_islow_consts[7], z4_s16, vl); + z4 = __riscv_vwmacc_vx_i32m2(z4, idct_islow_consts[8], z4_s16, vl); + + tmp0 = __riscv_vwmul_vx_i32m2(tmp0_s16, idct_islow_consts[ 3], vl); + tmp1 = __riscv_vwmul_vx_i32m2(tmp1_s16, idct_islow_consts[ 5], vl); + tmp2 = __riscv_vwmul_vx_i32m2(tmp2_s16, idct_islow_consts[10], vl); + tmp3 = __riscv_vwmul_vx_i32m2(tmp3_s16, idct_islow_consts[ 4], vl); + + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, -idct_islow_consts[0], tmp3_s16, vl); + tmp1 = __riscv_vwmacc_vx_i32m2(tmp1, -idct_islow_consts[2], tmp2_s16, vl); + tmp2 = __riscv_vwmacc_vx_i32m2(tmp2, -idct_islow_consts[2], tmp1_s16, vl); + tmp3 = __riscv_vwmacc_vx_i32m2(tmp3, -idct_islow_consts[0], tmp0_s16, vl); + + tmp0 = __riscv_vadd_vv_i32m2(tmp0, z3, vl); + tmp1 = __riscv_vadd_vv_i32m2(tmp1, z4, vl); + tmp2 = __riscv_vadd_vv_i32m2(tmp2, z3, vl); + tmp3 = __riscv_vadd_vv_i32m2(tmp3, z4, vl); + + /* Final output stage: descale and narrow to 16-bit. 
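+   * Per column, the scalar code in jidctint.c does
+   *   wsptr[0] = DESCALE(tmp10 + tmp3, CONST_BITS - PASS1_BITS);
+   *   wsptr[7] = DESCALE(tmp10 - tmp3, CONST_BITS - PASS1_BITS);
+   * (and likewise for tmp11/tmp2, tmp12/tmp1, tmp13/tmp0), which is realized
+   * below as an add of 1 << (DESCALE_P1 - 1) followed by a narrowing
+   * arithmetic shift right by DESCALE_P1 = 11.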
*/ + vint16m4_t cols_0123_i16m4, cols_7654_i16m4; + vint8m2_t tmp_i8m2; + vint16m4_t tmp_i16m4; + vint32m8_t tmp_i32m8; + + // combine (tmp10, tmp11, tmp12, tmp13) + vint32m4_t tmps_10_11 = __riscv_vlmul_ext_v_i32m2_i32m4(tmp10); + tmps_10_11 = __riscv_vslideup_vx_i32m4(tmps_10_11, __riscv_vlmul_ext_v_i32m2_i32m4(tmp11), 8, 16); + vint32m4_t tmps_12_13 = __riscv_vlmul_ext_v_i32m2_i32m4(tmp12); + tmps_12_13 = __riscv_vslideup_vx_i32m4(tmps_12_13, __riscv_vlmul_ext_v_i32m2_i32m4(tmp13), 8, 16); + + vint32m8_t tmps_10_11_12_13 = __riscv_vlmul_ext_v_i32m4_i32m8(tmps_10_11); + tmps_10_11_12_13 = __riscv_vslideup_vx_i32m8(tmps_10_11_12_13, __riscv_vlmul_ext_v_i32m4_i32m8(tmps_12_13), 16, 32); + + // combine (tmp3, tmp2, tmp1, tmp0) + vint32m4_t tmps_3_2 = __riscv_vlmul_ext_v_i32m2_i32m4(tmp3); + tmps_3_2 = __riscv_vslideup_vx_i32m4(tmps_3_2, __riscv_vlmul_ext_v_i32m2_i32m4(tmp2), 8, 16); + vint32m4_t tmps_1_0 = __riscv_vlmul_ext_v_i32m2_i32m4(tmp1); + tmps_1_0 = __riscv_vslideup_vx_i32m4(tmps_1_0, __riscv_vlmul_ext_v_i32m2_i32m4(tmp0), 8, 16); + + vint32m8_t tmps_3_2_1_0 = __riscv_vlmul_ext_v_i32m4_i32m8(tmps_3_2); + tmps_3_2_1_0 = __riscv_vslideup_vx_i32m8(tmps_3_2_1_0, __riscv_vlmul_ext_v_i32m4_i32m8(tmps_1_0), 16, 32); + + // col 0, 1, 2, 3 + tmp_i32m8 = __riscv_vadd_vv_i32m8(tmps_10_11_12_13, tmps_3_2_1_0, 32); + tmp_i32m8 = __riscv_vadd_vx_i32m8(tmp_i32m8, 1<<(DESCALE_P1-1), 32); // for Rounding + cols_0123_i16m4 = __riscv_vnsra_wx_i16m4(tmp_i32m8, DESCALE_P1, 32); + + // col 7, 6, 5, 4 + tmp_i32m8 = __riscv_vsub_vv_i32m8(tmps_10_11_12_13, tmps_3_2_1_0, 32); + tmp_i32m8 = __riscv_vadd_vx_i32m8(tmp_i32m8, 1<<(DESCALE_P1-1), 32); // for Rounding + cols_7654_i16m4 = __riscv_vnsra_wx_i16m4(tmp_i32m8, DESCALE_P1, 32); + + // combine vectors + *cols_all_i16m8 = __riscv_vlmul_ext_v_i16m4_i16m8(cols_0123_i16m4); + *cols_all_i16m8 = __riscv_vslideup_vx_i16m8(*cols_all_i16m8, __riscv_vlmul_ext_v_i16m4_i16m8(cols_7654_i16m4), 32, 64); +} + + +/* Perform the second pass of the accurate inverse DCT on a 4x8 block of + * coefficients. (To process the full 8x8 DCT block, this function-- or some + * other optimized variant-- needs to be called for both the right and left 4x8 + * blocks.) + * + * This "regular" version assumes that no optimization can be made to the IDCT + * calculation, since no useful set of coefficient values are all 0 after the + * first pass. + * + * Again, the original C implementation of the accurate IDCT (jpeg_idct_slow()) + * can be found in jidctint.c. Algorithmic changes made here are documented + * inline. 
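+ *
+ * One change shared with pass 1: the scalar code computes, in effect,
+ *   z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
+ *   z3 = MULTIPLY(z3, -FIX_1_961570560) + z5;
+ *   z4 = MULTIPLY(z4, -FIX_0_390180644) + z5;
+ * which is folded here into two widening multiply-accumulates per term via
+ * the pre-combined constants F_1_175_MINUS_1_961 and F_1_175_MINUS_0_390.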
+ */ + +static INLINE void jsimd_idct_islow_pass2_regular(vint16m8_t *cols_all_i16m8, + JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + int16_t workspace[DCTSIZE2]; /* buffers data */ + + // load transpose look-up table + vuint8m4_t vg_reg8 = __riscv_vle8_v_u8m4(trans_index8x8_u8, TRANS_TABLE_U8_SIZE); + + // transpose + vint16m8_t trans_all_i16m8 = __riscv_vrgather_vv_i16m8(*cols_all_i16m8, \ + __riscv_vzext_vf2_u16m8(vg_reg8, TRANS_TABLE_U8_SIZE), TRANS_TABLE_U8_SIZE); + + __riscv_vse16_v_i16m8(&workspace[0], trans_all_i16m8, DCTSIZE2); + + /* Even part */ + size_t vl = 8; + vint16m1_t z2_s16 = __riscv_vle16_v_i16m1(workspace + 2 * DCTSIZE, vl); + vint16m1_t z3_s16 = __riscv_vle16_v_i16m1(workspace + 6 * DCTSIZE, vl); + + vint32m2_t tmp2 = __riscv_vwmul_vx_i32m2(z2_s16, idct_islow_consts[1], vl); + vint32m2_t tmp3 = __riscv_vwmul_vx_i32m2(z2_s16, idct_islow_consts[6], vl); + tmp2 = __riscv_vwmacc_vx_i32m2(tmp2, idct_islow_consts[9], z3_s16, vl); + tmp3 = __riscv_vwmacc_vx_i32m2(tmp3, idct_islow_consts[1], z3_s16, vl); + + z2_s16 = __riscv_vle16_v_i16m1(workspace + 0 * DCTSIZE, vl); + z3_s16 = __riscv_vle16_v_i16m1(workspace + 4 * DCTSIZE, vl); + + vint32m2_t tmp0 = __riscv_vwmul_vx_i32m2(__riscv_vadd_vv_i16m1(z2_s16, z3_s16, vl), 8192, vl); + vint32m2_t tmp1 = __riscv_vwmul_vx_i32m2(__riscv_vsub_vv_i16m1(z2_s16, z3_s16, vl), 8192, vl); + + vint32m2_t tmp10 = __riscv_vadd_vv_i32m2(tmp0, tmp3, vl); + vint32m2_t tmp13 = __riscv_vsub_vv_i32m2(tmp0, tmp3, vl); + vint32m2_t tmp11 = __riscv_vadd_vv_i32m2(tmp1, tmp2, vl); + vint32m2_t tmp12 = __riscv_vsub_vv_i32m2(tmp1, tmp2, vl); + + /* Odd part */ + vint16m1_t tmp0_s16 = __riscv_vle16_v_i16m1(workspace + 7 * DCTSIZE, vl); + vint16m1_t tmp1_s16 = __riscv_vle16_v_i16m1(workspace + 5 * DCTSIZE, vl); + vint16m1_t tmp2_s16 = __riscv_vle16_v_i16m1(workspace + 3 * DCTSIZE, vl); + vint16m1_t tmp3_s16 = __riscv_vle16_v_i16m1(workspace + 1 * DCTSIZE, vl); + + z3_s16 = __riscv_vadd_vv_i16m1(tmp0_s16, tmp2_s16, vl); + vint16m1_t z4_s16 = __riscv_vadd_vv_i16m1(tmp1_s16, tmp3_s16, vl); + + vint32m2_t z3 = __riscv_vwmul_vx_i32m2(z3_s16, idct_islow_consts[11], vl); + vint32m2_t z4 = __riscv_vwmul_vx_i32m2(z3_s16, idct_islow_consts[ 7], vl); + z3 = __riscv_vwmacc_vx_i32m2(z3, idct_islow_consts[7], z4_s16, vl); + z4 = __riscv_vwmacc_vx_i32m2(z4, idct_islow_consts[8], z4_s16, vl); + + tmp0 = __riscv_vwmul_vx_i32m2(tmp0_s16, idct_islow_consts[ 3], vl); + tmp1 = __riscv_vwmul_vx_i32m2(tmp1_s16, idct_islow_consts[ 5], vl); + tmp2 = __riscv_vwmul_vx_i32m2(tmp2_s16, idct_islow_consts[10], vl); + tmp3 = __riscv_vwmul_vx_i32m2(tmp3_s16, idct_islow_consts[ 4], vl); + + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, -idct_islow_consts[0], tmp3_s16, vl); + tmp1 = __riscv_vwmacc_vx_i32m2(tmp1, -idct_islow_consts[2], tmp2_s16, vl); + tmp2 = __riscv_vwmacc_vx_i32m2(tmp2, -idct_islow_consts[2], tmp1_s16, vl); + tmp3 = __riscv_vwmacc_vx_i32m2(tmp3, -idct_islow_consts[0], tmp0_s16, vl); + + tmp0 = __riscv_vadd_vv_i32m2(tmp0, z3, vl); + tmp1 = __riscv_vadd_vv_i32m2(tmp1, z4, vl); + tmp2 = __riscv_vadd_vv_i32m2(tmp2, z3, vl); + tmp3 = __riscv_vadd_vv_i32m2(tmp3, z4, vl); + +/* Final output stage: descale and narrow to 8-bit. */ +/* Clamp to range [0-255]. 
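+ * The scalar code uses DESCALE(..., CONST_BITS + PASS1_BITS + 3) followed by
+ * range_limit[]. Here the total shift of DESCALE_P2 = 18 is split into a
+ * narrowing shift by 16 plus a vnclip by DESCALE_P2 - 16 = 2 that saturates
+ * to [-128, 127]; adding CENTERJSAMPLE afterwards gives unsigned samples in
+ * [0, 255].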
*/ + vuint8m2_t cols_0123_u8m2, cols_7654_u8m2; + vint8m2_t tmp_i8m2; + vint16m4_t tmp_i16m4; + vint32m8_t tmp_i32m8; + + // combine (tmp10, tmp11, tmp12, tmp13) + vint32m4_t tmps_10_11 = __riscv_vlmul_ext_v_i32m2_i32m4(tmp10); + tmps_10_11 = __riscv_vslideup_vx_i32m4(tmps_10_11, __riscv_vlmul_ext_v_i32m2_i32m4(tmp11), 8, 16); + vint32m4_t tmps_12_13 = __riscv_vlmul_ext_v_i32m2_i32m4(tmp12); + tmps_12_13 = __riscv_vslideup_vx_i32m4(tmps_12_13, __riscv_vlmul_ext_v_i32m2_i32m4(tmp13), 8, 16); + + vint32m8_t tmps_10_11_12_13 = __riscv_vlmul_ext_v_i32m4_i32m8(tmps_10_11); + tmps_10_11_12_13 = __riscv_vslideup_vx_i32m8(tmps_10_11_12_13, __riscv_vlmul_ext_v_i32m4_i32m8(tmps_12_13), 16, 32); + + // combine (tmp3, tmp2, tmp1, tmp0) + vint32m4_t tmps_3_2 = __riscv_vlmul_ext_v_i32m2_i32m4(tmp3); + tmps_3_2 = __riscv_vslideup_vx_i32m4(tmps_3_2, __riscv_vlmul_ext_v_i32m2_i32m4(tmp2), 8, 16); + vint32m4_t tmps_1_0 = __riscv_vlmul_ext_v_i32m2_i32m4(tmp1); + tmps_1_0 = __riscv_vslideup_vx_i32m4(tmps_1_0, __riscv_vlmul_ext_v_i32m2_i32m4(tmp0), 8, 16); + + vint32m8_t tmps_3_2_1_0 = __riscv_vlmul_ext_v_i32m4_i32m8(tmps_3_2); + tmps_3_2_1_0 = __riscv_vslideup_vx_i32m8(tmps_3_2_1_0, __riscv_vlmul_ext_v_i32m4_i32m8(tmps_1_0), 16, 32); + + // col 0, 1, 2, 3 + tmp_i32m8 = __riscv_vadd_vv_i32m8(tmps_10_11_12_13, tmps_3_2_1_0, 32); + tmp_i16m4 = __riscv_vnsra_wx_i16m4(tmp_i32m8, 16, 32); + tmp_i8m2 = __riscv_vnclip_wx_i8m2(tmp_i16m4, DESCALE_P2 - 16, 32); + cols_0123_u8m2 = __riscv_vreinterpret_v_i8m2_u8m2(tmp_i8m2); + cols_0123_u8m2 = __riscv_vadd_vx_u8m2(cols_0123_u8m2, CENTERJSAMPLE, 32); + + // col 7, 6, 5, 4 + tmp_i32m8 = __riscv_vsub_vv_i32m8(tmps_10_11_12_13, tmps_3_2_1_0, 32); + tmp_i16m4 = __riscv_vnsra_wx_i16m4(tmp_i32m8, 16, 32); + tmp_i8m2 = __riscv_vnclip_wx_i8m2(tmp_i16m4, DESCALE_P2 - 16, 32); + cols_7654_u8m2 = __riscv_vreinterpret_v_i8m2_u8m2(tmp_i8m2); + cols_7654_u8m2 = __riscv_vadd_vx_u8m2(cols_7654_u8m2, CENTERJSAMPLE, 32); + + // combine vectors + vuint8m4_t cols_all_u8m4 = __riscv_vlmul_ext_v_u8m2_u8m4(cols_0123_u8m2); + cols_all_u8m4 = __riscv_vslideup_vx_u8m4(cols_all_u8m4, __riscv_vlmul_ext_v_u8m2_u8m4(cols_7654_u8m2), 32, 64); + + // transpose + vuint8m4_t trans_all_u8m4 = __riscv_vrgather(cols_all_u8m4, vg_reg8, TRANS_TABLE_U8_SIZE); + + // extract columns + vuint8mf2_t col_0_u8mf2, col_1_u8mf2, col_2_u8mf2, col_3_u8mf2; + vuint8mf2_t col_4_u8mf2, col_5_u8mf2, col_6_u8mf2, col_7_u8mf2; + + vuint8m4_t slidedown_m4 = __riscv_vslidedown_vx_u8m4(trans_all_u8m4, 0, vl); + col_0_u8mf2 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(trans_all_u8m4, 8, vl); + col_1_u8mf2 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(trans_all_u8m4, 16, vl); + col_2_u8mf2 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(trans_all_u8m4, 24, vl); + col_3_u8mf2 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(trans_all_u8m4, 32, vl); + col_4_u8mf2 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(trans_all_u8m4, 40, vl); + col_5_u8mf2 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(trans_all_u8m4, 48, vl); + col_6_u8mf2 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + slidedown_m4 = __riscv_vslidedown_vx_u8m4(trans_all_u8m4, 56, vl); + col_7_u8mf2 = __riscv_vlmul_trunc_v_u8m4_u8mf2(slidedown_m4); + + // store to memory + JSAMPROW 
outptr0 = output_buf[0] + output_col;
+  JSAMPROW outptr1 = output_buf[1] + output_col;
+  JSAMPROW outptr2 = output_buf[2] + output_col;
+  JSAMPROW outptr3 = output_buf[3] + output_col;
+  JSAMPROW outptr4 = output_buf[4] + output_col;
+  JSAMPROW outptr5 = output_buf[5] + output_col;
+  JSAMPROW outptr6 = output_buf[6] + output_col;
+  JSAMPROW outptr7 = output_buf[7] + output_col;
+
+  __riscv_vse8_v_u8mf2(outptr0, col_0_u8mf2, vl);
+  __riscv_vse8_v_u8mf2(outptr1, col_1_u8mf2, vl);
+  __riscv_vse8_v_u8mf2(outptr2, col_2_u8mf2, vl);
+  __riscv_vse8_v_u8mf2(outptr3, col_3_u8mf2, vl);
+  __riscv_vse8_v_u8mf2(outptr4, col_4_u8mf2, vl);
+  __riscv_vse8_v_u8mf2(outptr5, col_5_u8mf2, vl);
+  __riscv_vse8_v_u8mf2(outptr6, col_6_u8mf2, vl);
+  __riscv_vse8_v_u8mf2(outptr7, col_7_u8mf2, vl);
+}
+
diff --git a/simd/rvv_andes/jidctred-rvv.c b/simd/rvv_andes/jidctred-rvv.c
new file mode 100644
index 000000000..4e03d6da4
--- /dev/null
+++ b/simd/rvv_andes/jidctred-rvv.c
@@ -0,0 +1,436 @@
+/*
+* jidctred-rvv.c - reduced-size IDCT (RISC-V RVV)
+*
+* Copyright (c) 2012-2024 Andes Technology Corporation
+* All rights reserved.
+*/
+/*
+ * jidctred-neon.c - reduced-size IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <riscv_vector.h>
+
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+
+#define F_0_211  1730
+#define F_0_509  4176
+#define F_0_601  4926
+#define F_0_720  5906
+#define F_0_765  6270
+#define F_0_850  6967
+#define F_0_899  7373
+#define F_1_061  8697
+#define F_1_272  10426
+#define F_1_451  11893
+#define F_1_847  15137
+#define F_2_172  17799
+#define F_2_562  20995
+#define F_3_624  29692
+
+
+/* jsimd_idct_2x2_rvv() is an inverse DCT function that produces reduced-size
+ * 2x2 output from an 8x8 DCT block. It uses the same calculations and
+ * produces exactly the same output as IJG's original jpeg_idct_2x2() function
+ * from jpeg-6b, which can be found in jidctred.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.720959822 =  5906 * 2^-13
+ *    0.850430095 =  6967 * 2^-13
+ *    1.272758580 = 10426 * 2^-13
+ *    3.624509785 = 29692 * 2^-13
+ *
+ * See jidctred.c for further details of the 2x2 IDCT algorithm. Where
+ * possible, the variable names and comments here in jsimd_idct_2x2_rvv()
+ * match up with those in jpeg_idct_2x2().
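+ *
+ * With only the DC term and AC rows/columns 1, 3, 5 and 7 contributing, each
+ * 1-D pass reduces to one even/odd pair:
+ *   tmp10 = DC << (CONST_BITS + 2)
+ *   tmp0  = 3.624509785 * c1 - 1.272758580 * c3
+ *           + 0.850430095 * c5 - 0.720959822 * c7    (constants * 2^13)
+ *   out0  = descale(tmp10 + tmp0),  out1 = descale(tmp10 - tmp0)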
+ */ + +static const int16_t jsimd_idct_2x2_consts[] = { + -F_0_720, F_0_850, -F_1_272, F_3_624 +}; + +void jsimd_idct_2x2_rvv(void *dct_table, JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + ISLOW_MULT_TYPE *quantptr = dct_table; + + /* Load DCT coefficients. */ + size_t vl = 8; + vint16m1_t row0 = __riscv_vle16_v_i16m1(coef_block + 0 * DCTSIZE, vl); + vint16m1_t row1 = __riscv_vle16_v_i16m1(coef_block + 1 * DCTSIZE, vl); + vint16m1_t row3 = __riscv_vle16_v_i16m1(coef_block + 3 * DCTSIZE, vl); + vint16m1_t row5 = __riscv_vle16_v_i16m1(coef_block + 5 * DCTSIZE, vl); + vint16m1_t row7 = __riscv_vle16_v_i16m1(coef_block + 7 * DCTSIZE, vl); + + /* Load quantization table values. */ + vint16m1_t quant_row0 = __riscv_vle16_v_i16m1(quantptr + 0 * DCTSIZE, vl); + vint16m1_t quant_row1 = __riscv_vle16_v_i16m1(quantptr + 1 * DCTSIZE, vl); + vint16m1_t quant_row3 = __riscv_vle16_v_i16m1(quantptr + 3 * DCTSIZE, vl); + vint16m1_t quant_row5 = __riscv_vle16_v_i16m1(quantptr + 5 * DCTSIZE, vl); + vint16m1_t quant_row7 = __riscv_vle16_v_i16m1(quantptr + 7 * DCTSIZE, vl); + + /* Dequantize DCT coefficients. */ + row0 = __riscv_vmul_vv_i16m1(row0, quant_row0, vl); + row1 = __riscv_vmul_vv_i16m1(row1, quant_row1, vl); + row3 = __riscv_vmul_vv_i16m1(row3, quant_row3, vl); + row5 = __riscv_vmul_vv_i16m1(row5, quant_row5, vl); + row7 = __riscv_vmul_vv_i16m1(row7, quant_row7, vl); + + /* Pass 1: process columns from input, put results in vectors row0 and + * row1. + */ + + /* Even part */ + vint32m2_t tmp10 = __riscv_vwmul_vx_i32m2(row0, 8192, vl); // 2 ^ (CONST_BITS) + tmp10 = __riscv_vsll_vx_i32m2 (tmp10, 2, vl); + + /* Odd part */ + vint32m2_t tmp0 = __riscv_vwmul_vx_i32m2(row1, jsimd_idct_2x2_consts[3], vl); + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, jsimd_idct_2x2_consts[2], row3, vl); + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, jsimd_idct_2x2_consts[1], row5, vl); + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, jsimd_idct_2x2_consts[0], row7, vl); + + /* Final output stage: descale and narrow to 16-bit. */ + vint32m2_t tmp_tt; + tmp_tt = __riscv_vadd_vv_i32m2(tmp10, tmp0, vl); + tmp_tt = __riscv_vadd_vx_i32m2(tmp_tt, 1<<(CONST_BITS-1), vl); + row0 = __riscv_vnsra_wx_i16m1(tmp_tt, CONST_BITS, vl); + + tmp_tt = __riscv_vsub_vv_i32m2(tmp10, tmp0, vl); + tmp_tt = __riscv_vadd_vx_i32m2(tmp_tt, 1<<(CONST_BITS-1), vl); + row1 = __riscv_vnsra_wx_i16m1(tmp_tt, CONST_BITS, vl); + + /* Transpose two rows, ready for second pass. 
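+   * Only a 2x8 -> 8x2 rearrangement is needed, so the 16-entry table below
+   * gathers the used columns (0, 1, 3, 5, 7) of both rows into adjacent
+   * pairs, e.g.
+   *   col1 = { row0[1], row1[1] }
+   * while its entries 10..13 hold the index order reused for the final 2x2
+   * output transpose.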
*/ + const uint8_t trans_index_tab_u8[16] = + { + 0, 8, + 1, 9, + 3, 11, + 5, 13, + 7, 15, + 0, 2, /* [10~13] output transpose index order */ + 1, 3, + 0, 0, + }; + + // extend m1 to m2 + vint16m2_t m2_row0 = __riscv_vlmul_ext_v_i16m1_i16m2(row0); + + // combine + m2_row0 = __riscv_vslideup_vx_i16m2(m2_row0, __riscv_vlmul_ext_v_i16m1_i16m2(row1), 8, 16); + + // load transpose look-up table + vuint8m1_t vg_reg8 = __riscv_vle8_v_u8m1(trans_index_tab_u8, 16); + + // saved for output index + vuint8m1_t index_order_u8m1 = __riscv_vslidedown_vx_u8m1(vg_reg8, 10, 4); + + // interpret to u16 & transpose + vint16m2_t vg_reg16 = __riscv_vrgather(m2_row0, __riscv_vzext_vf2_u16m2(vg_reg8, 10), 10); + + // extract columns + vl = 2; + vint16m1_t col0 = __riscv_vget_v_i16m2_i16m1(vg_reg16, 0); + vint16m1_t col1 = __riscv_vslidedown_vx_i16m1(col0, 2, vl); + vint16m1_t col3 = __riscv_vslidedown_vx_i16m1(col0, 4, vl); + vint16m1_t col5 = __riscv_vslidedown_vx_i16m1(col0, 6, vl); + + vint16m2_t slidedown_m2 = __riscv_vslidedown_vx_i16m2(vg_reg16, 8, vl); + vint16m1_t col7 = __riscv_vlmul_trunc_v_i16m2_i16m1(slidedown_m2); + + /* Pass 2: process two rows, store to output array. */ + + /* Even part: we're only interested in col0; the top half of tmp10 is "don't + * care." + */ + tmp10 = __riscv_vwmul_vx_i32m2(col0, 8192, vl); + tmp10 = __riscv_vsll_vx_i32m2 (tmp10, 2, vl); + + /* Odd part: we're only interested in the bottom half of tmp0. */ + tmp0 = __riscv_vwmul_vx_i32m2(col1, jsimd_idct_2x2_consts[3], vl); + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, jsimd_idct_2x2_consts[2], col3, vl); + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, jsimd_idct_2x2_consts[1], col5, vl); + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, jsimd_idct_2x2_consts[0], col7, vl); + + /* Final output stage: descale and clamp to range [0-255]. */ + /* Narrow to 8-bit and convert to unsigned. */ + vint32m2_t tmp_i32m2_0 = __riscv_vadd_vv_i32m2(tmp10, tmp0, 2); + vint32m2_t tmp_i32m2_1 = __riscv_vsub_vv_i32m2(tmp10, tmp0, 2); + tmp_i32m2_0 = __riscv_vslideup_vx_i32m2 (tmp_i32m2_0, tmp_i32m2_1, 2, 4); + vint16m1_t tmp_i16m1 = __riscv_vnsra_wx_i16m1(tmp_i32m2_0, 16, 4); + vint8mf2_t tmp_i8mf2 = __riscv_vnclip_wx_i8mf2(tmp_i16m1, CONST_BITS + PASS1_BITS + 3 + 2 - 16, 4); + vuint8mf2_t u8mf2_col0 = __riscv_vreinterpret_v_i8mf2_u8mf2(tmp_i8mf2); + u8mf2_col0 = __riscv_vadd_vx_u8mf2(u8mf2_col0, CENTERJSAMPLE, 4); + + /* Transpose */ + vuint8mf2_t v0_out = __riscv_vrgather_vv_u8mf2(u8mf2_col0, \ + __riscv_vlmul_trunc_v_u8m1_u8mf2(index_order_u8m1), 4); + vuint8mf2_t v1_out = __riscv_vslidedown_vx_u8mf2(v0_out, 2, 2); + + /* Store 2x2 block to memory. */ + JSAMPROW outptr0 = output_buf[0] + output_col; + JSAMPROW outptr1 = output_buf[1] + output_col; + __riscv_vse8_v_u8mf2(outptr0, v0_out, 2); + __riscv_vse8_v_u8mf2(outptr1, v1_out, 2); +} + + +/* jsimd_idct_4x4_rvv() is an inverse DCT function that produces reduced-size + * 4x4 output from an 8x8 DCT block. It uses the same calculations and + * produces exactly the same output as IJG's original jpeg_idct_4x4() function + * from jpeg-6b, which can be found in jidctred.c. 
+ * + * Scaled integer constants are used to avoid floating-point arithmetic: + * 0.211164243 = 1730 * 2^-13 + * 0.509795579 = 4176 * 2^-13 + * 0.601344887 = 4926 * 2^-13 + * 0.765366865 = 6270 * 2^-13 + * 0.899976223 = 7373 * 2^-13 + * 1.061594337 = 8697 * 2^-13 + * 1.451774981 = 11893 * 2^-13 + * 1.847759065 = 15137 * 2^-13 + * 2.172734803 = 17799 * 2^-13 + * 2.562915447 = 20995 * 2^-13 + * + * See jidctred.c for further details of the 4x4 IDCT algorithm. Where + * possible, the variable names and comments here in jsimd_idct_4x4_rvv() + * match up with those in jpeg_idct_4x4(). + */ + +static const int16_t jsimd_idct_4x4_consts[] = { + F_1_847, -F_0_765, -F_0_211, F_1_451, + -F_2_172, F_1_061, -F_0_509, -F_0_601, + F_0_899, F_2_562, 0, 0 +}; + +void jsimd_idct_4x4_rvv(void *dct_table, JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + ptrdiff_t bstride; + ISLOW_MULT_TYPE *quantptr = dct_table; + vint16m4_t rows_0132_i16m4; + + /* Load DCT coefficients. */ + size_t vl = 8; + vint16m1_t row0 = __riscv_vle16_v_i16m1(coef_block + 0 * DCTSIZE, vl); + vint16m1_t row1 = __riscv_vle16_v_i16m1(coef_block + 1 * DCTSIZE, vl); + vint16m1_t row2 = __riscv_vle16_v_i16m1(coef_block + 2 * DCTSIZE, vl); + vint16m1_t row3 = __riscv_vle16_v_i16m1(coef_block + 3 * DCTSIZE, vl); + vint16m1_t row5 = __riscv_vle16_v_i16m1(coef_block + 5 * DCTSIZE, vl); + vint16m1_t row6 = __riscv_vle16_v_i16m1(coef_block + 6 * DCTSIZE, vl); + vint16m1_t row7 = __riscv_vle16_v_i16m1(coef_block + 7 * DCTSIZE, vl); + + /* Load quantization table values for DC coefficients. */ + /* Dequantize DC coefficients. */ + vint16m1_t quant_row0 = __riscv_vle16_v_i16m1(quantptr + 0 * DCTSIZE, vl); + row0 = __riscv_vmul_vv_i16m1(row0, quant_row0, vl); + + #define TRANS_TABLE_U8_SIZE 32 + const uint8_t trans_index_4x4_u8[TRANS_TABLE_U8_SIZE] = + { + 0, 8, 24, 16, + 1, 9, 25, 17, + 2, 10, 26, 18, + 3, 11, 27, 19, + 5, 13, 29, 21, + 6, 14, 30, 22, + 7, 15, 31, 23, + 0, 0, 0, 0, + }; + + /* load transpose look-up table */ + vuint8m2_t vg_reg8 = __riscv_vle8_v_u8m2(trans_index_4x4_u8, TRANS_TABLE_U8_SIZE); + + /* Construct bitmap to test if all AC coefficients are 0. */ + vuint16m1_t vec_zero = __riscv_vmv_s_x_u16m1(0, vl); + vint16m1_t bitmap = __riscv_vor_vv_i16m1(row7, row6, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row5, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row3, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row2, vl); + bitmap = __riscv_vor_vv_i16m1(bitmap, row1, vl); + vuint16m1_t tmp_u16m1 = __riscv_vredor_vs_u16m1_u16m1(__riscv_vreinterpret_v_i16m1_u16m1(bitmap), vec_zero, vl); + uint16_t ac_bitmap = __riscv_vmv_x_s_u16m1_u16(tmp_u16m1); + if (0 == ac_bitmap) { + /* All AC coefficients are zero. */ + vint16m1_t dcval_i16m1 = __riscv_vsll_vx_i16m1(row0, PASS1_BITS, vl); + + /* combine vectors */ + rows_0132_i16m4 = __riscv_vlmul_ext_v_i16m1_i16m4(dcval_i16m1); + rows_0132_i16m4 = __riscv_vslideup_vx_i16m4(rows_0132_i16m4, rows_0132_i16m4, DCTSIZE2/8, DCTSIZE2/4); + rows_0132_i16m4 = __riscv_vslideup_vx_i16m4(rows_0132_i16m4, rows_0132_i16m4, DCTSIZE2/4, DCTSIZE2/2); + } else { + /* Load quantization table. 
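+     * (Coefficient row 4 is never used by the 4x4 kernel, so neither its
+     * coefficients nor its quantization row are loaded.)  The even part
+     * below follows jpeg_idct_4x4() in jidctred.c:
+     *   tmp0  = DC << (CONST_BITS + 1)
+     *   tmp2  = 1.847759065 * z2 - 0.765366865 * z3   (z2 = row 2, z3 = row 6)
+     *   tmp10 = tmp0 + tmp2,  tmp12 = tmp0 - tmp2
+     * with constants pre-scaled by 2^13, hence the 16384 (= 2^14) multiplier.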
*/ + vint16m1_t quant_row1 = __riscv_vle16_v_i16m1(quantptr + 1 * DCTSIZE, vl); + vint16m1_t quant_row2 = __riscv_vle16_v_i16m1(quantptr + 2 * DCTSIZE, vl); + vint16m1_t quant_row3 = __riscv_vle16_v_i16m1(quantptr + 3 * DCTSIZE, vl); + vint16m1_t quant_row5 = __riscv_vle16_v_i16m1(quantptr + 5 * DCTSIZE, vl); + vint16m1_t quant_row6 = __riscv_vle16_v_i16m1(quantptr + 6 * DCTSIZE, vl); + vint16m1_t quant_row7 = __riscv_vle16_v_i16m1(quantptr + 7 * DCTSIZE, vl); + + /* Even part */ + vint32m2_t tmp0, tmp2; + tmp0 = __riscv_vwmul_vx_i32m2(row0, 16384, vl); // 16384 = 2^(CONST_BITS + 1) + row2 = __riscv_vmul_vv_i16m1(row2, quant_row2, vl); + row6 = __riscv_vmul_vv_i16m1(row6, quant_row6, vl); + tmp2 = __riscv_vwmul_vx_i32m2(row2, jsimd_idct_4x4_consts[0], vl); + tmp2 = __riscv_vwmacc_vx_i32m2(tmp2, jsimd_idct_4x4_consts[1], row6, vl); + vint32m2_t tmp10 = __riscv_vadd_vv_i32m2(tmp0, tmp2, vl); + vint32m2_t tmp12 = __riscv_vsub_vv_i32m2(tmp0, tmp2, vl); + + /* Odd part */ + row7 = __riscv_vmul_vv_i16m1(row7, quant_row7, vl); + row5 = __riscv_vmul_vv_i16m1(row5, quant_row5, vl); + row3 = __riscv_vmul_vv_i16m1(row3, quant_row3, vl); + row1 = __riscv_vmul_vv_i16m1(row1, quant_row1, vl); + + tmp0 = __riscv_vwmul_vx_i32m2 (row7, jsimd_idct_4x4_consts[2], vl); + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, jsimd_idct_4x4_consts[3], row5, vl); + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, jsimd_idct_4x4_consts[4], row3, vl); + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, jsimd_idct_4x4_consts[5], row1, vl); + + tmp2 = __riscv_vwmul_vx_i32m2 (row7, jsimd_idct_4x4_consts[6], vl); + tmp2 = __riscv_vwmacc_vx_i32m2(tmp2, jsimd_idct_4x4_consts[7], row5, vl); + tmp2 = __riscv_vwmacc_vx_i32m2(tmp2, jsimd_idct_4x4_consts[8], row3, vl); + tmp2 = __riscv_vwmacc_vx_i32m2(tmp2, jsimd_idct_4x4_consts[9], row1, vl); + + /* Final output stage */ + vint32m4_t tmp10_tmp12_i32m4 = __riscv_vlmul_ext_v_i32m2_i32m4(tmp10); + tmp10_tmp12_i32m4 = __riscv_vslideup_vx_i32m4(tmp10_tmp12_i32m4, __riscv_vlmul_ext_v_i32m2_i32m4(tmp12), 8, 16); + vint32m4_t tmp2_tmp0_i32m4 = __riscv_vlmul_ext_v_i32m2_i32m4(tmp2); + tmp2_tmp0_i32m4 = __riscv_vslideup_vx_i32m4(tmp2_tmp0_i32m4, __riscv_vlmul_ext_v_i32m2_i32m4(tmp0), 8, 16); + + vint32m4_t tmp_add_i32m4 = __riscv_vadd_vv_i32m4(tmp10_tmp12_i32m4, tmp2_tmp0_i32m4, 16); + vint32m4_t tmp_sub_i32m4 = __riscv_vsub_vv_i32m4(tmp10_tmp12_i32m4, tmp2_tmp0_i32m4, 16); + tmp_add_i32m4 = __riscv_vadd_vx_i32m4(tmp_add_i32m4, 1<<(CONST_BITS - PASS1_BITS + 1 - 1), 16); + tmp_sub_i32m4 = __riscv_vadd_vx_i32m4(tmp_sub_i32m4, 1<<(CONST_BITS - PASS1_BITS + 1 - 1), 16); + vint16m2_t tmp_rows_01 = __riscv_vnsra_wx_i16m2(tmp_add_i32m4, (CONST_BITS - PASS1_BITS + 1), 16); + vint16m2_t tmp_rows_32 = __riscv_vnsra_wx_i16m2(tmp_sub_i32m4, (CONST_BITS - PASS1_BITS + 1), 16); + + /* combine vectors */ + rows_0132_i16m4 = __riscv_vlmul_ext_v_i16m2_i16m4(tmp_rows_01); + rows_0132_i16m4 = __riscv_vslideup_vx_i16m4(rows_0132_i16m4, __riscv_vlmul_ext_v_i16m2_i16m4(tmp_rows_32), 16, 32); + } + + /* Transpose 8x4 block to perform IDCT on rows in second pass. 
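+   * Pass 1 stored its outputs in the order row0, row1, row3, row2 (the two
+   * subtractive results come out reversed), so the gather table lists source
+   * offsets {c, c + 8, c + 24, c + 16} for each output column c.  Column 4
+   * is not needed by the 4x4 second pass, which is why only 28 of the 32
+   * table entries take part in the gather.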
*/ + + /* interpret to u16 & transpose */ + vint16m4_t vg_reg16 = __riscv_vrgather(rows_0132_i16m4, __riscv_vzext_vf2_u16m4(vg_reg8, 28), 28); + + /* extract columns */ + vl = 4; + vint16m1_t col0 = __riscv_vget_v_i16m4_i16m1(vg_reg16, 0); + vint16m1_t col1 = __riscv_vslidedown_vx_i16m1(col0, 4, vl); + + vint16m4_t slidedown_m4 = __riscv_vslidedown_vx_i16m4(vg_reg16, 8, 8); + vint16m2_t tmp_i16m2 = __riscv_vlmul_trunc_v_i16m4_i16m2(slidedown_m4); + vint16m1_t col2 = __riscv_vlmul_trunc_v_i16m2_i16m1(tmp_i16m2); + tmp_i16m2 = __riscv_vslidedown_vx_i16m2(tmp_i16m2, 4, vl); + vint16m1_t col3 = __riscv_vlmul_trunc_v_i16m2_i16m1(tmp_i16m2); + + slidedown_m4 = __riscv_vslidedown_vx_i16m4(vg_reg16, 16, 8); + tmp_i16m2 = __riscv_vlmul_trunc_v_i16m4_i16m2(slidedown_m4); + vint16m1_t col5 = __riscv_vlmul_trunc_v_i16m2_i16m1(tmp_i16m2); + tmp_i16m2 = __riscv_vslidedown_vx_i16m2(tmp_i16m2, 4, vl); + vint16m1_t col6 = __riscv_vlmul_trunc_v_i16m2_i16m1(tmp_i16m2); + + slidedown_m4 = __riscv_vslidedown_vx_i16m4(vg_reg16, 24, vl); + vint16m1_t col7 = __riscv_vlmul_trunc_v_i16m4_i16m1(slidedown_m4); + + /* Commence second pass of IDCT. */ + + /* Even part */ + vint32m2_t tmp0, tmp2, tmp10, tmp12; + tmp0 = __riscv_vwmul_vx_i32m2(col0, 16384, vl); // 2^(CONST_BITS + 1) + tmp2 = __riscv_vwmul_vx_i32m2 (col2, jsimd_idct_4x4_consts[0], vl); + tmp2 = __riscv_vwmacc_vx_i32m2(tmp2, jsimd_idct_4x4_consts[1], col6, vl); + tmp10 = __riscv_vadd_vv_i32m2(tmp0, tmp2, vl); + tmp12 = __riscv_vsub_vv_i32m2(tmp0, tmp2, vl); + + /* Odd part */ + tmp0 = __riscv_vwmul_vx_i32m2 (col7, jsimd_idct_4x4_consts[2], vl); + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, jsimd_idct_4x4_consts[3], col5, vl); + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, jsimd_idct_4x4_consts[4], col3, vl); + tmp0 = __riscv_vwmacc_vx_i32m2(tmp0, jsimd_idct_4x4_consts[5], col1, vl); + + tmp2 = __riscv_vwmul_vx_i32m2 (col7, jsimd_idct_4x4_consts[6], vl); + tmp2 = __riscv_vwmacc_vx_i32m2(tmp2, jsimd_idct_4x4_consts[7], col5, vl); + tmp2 = __riscv_vwmacc_vx_i32m2(tmp2, jsimd_idct_4x4_consts[8], col3, vl); + tmp2 = __riscv_vwmacc_vx_i32m2(tmp2, jsimd_idct_4x4_consts[9], col1, vl); + + /* Final output stage: descale and clamp to range [0-255]. 
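+   * The total descale here is CONST_BITS + PASS1_BITS + 3 + 1 = 19 bits,
+   * split into a narrowing shift by 16 and a saturating vnclip by the
+   * remaining 3, then re-centered with CENTERJSAMPLE; compare
+   *   range_limit[DESCALE(tmp, CONST_BITS + PASS1_BITS + 3 + 1) & RANGE_MASK]
+   * in jpeg_idct_4x4().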
*/ + vuint8mf2_t u8mf2_col01, u8mf2_col23; + vint8mf2_t tmp_i8mf2; + vint16m1_t tmp_i16m1; + vint32m2_t tmp_i32m2_0, tmp_i32m2_1; + tmp_i32m2_0 = __riscv_vadd_vv_i32m2(tmp10, tmp2, 4); + tmp_i32m2_1 = __riscv_vadd_vv_i32m2(tmp12, tmp0, 4); + tmp_i32m2_0 = __riscv_vslideup_vx_i32m2(tmp_i32m2_0, tmp_i32m2_1, 4, 8); + tmp_i16m1 = __riscv_vnsra_wx_i16m1(tmp_i32m2_0, 16, 8); + tmp_i8mf2 = __riscv_vnclip_wx_i8mf2(tmp_i16m1, CONST_BITS + PASS1_BITS + 3 + 1 - 16, 8); + u8mf2_col01 = __riscv_vreinterpret_v_i8mf2_u8mf2(tmp_i8mf2); + u8mf2_col01 = __riscv_vadd_vx_u8mf2(u8mf2_col01, CENTERJSAMPLE, 8); + + tmp_i32m2_0 = __riscv_vsub_vv_i32m2(tmp12, tmp0, 4); + tmp_i32m2_1 = __riscv_vsub_vv_i32m2(tmp10, tmp2, 4); + tmp_i32m2_0 = __riscv_vslideup_vx_i32m2(tmp_i32m2_0, tmp_i32m2_1, 4, 8); + tmp_i16m1 = __riscv_vnsra_wx_i16m1(tmp_i32m2_0, 16, 8); + tmp_i8mf2 = __riscv_vnclip_wx_i8mf2(tmp_i16m1, CONST_BITS + PASS1_BITS + 3 + 1 - 16, 8); + u8mf2_col23 = __riscv_vreinterpret_v_i8mf2_u8mf2(tmp_i8mf2); + u8mf2_col23 = __riscv_vadd_vx_u8mf2(u8mf2_col23, CENTERJSAMPLE, 8); + + vuint8m1_t u8m1_col0123 = __riscv_vslideup_vx_u8m1(__riscv_vlmul_ext_v_u8mf2_u8m1(u8mf2_col01), + __riscv_vlmul_ext_v_u8mf2_u8m1(u8mf2_col23), 8, 16); + + /* Transpose */ + uint8_t out_index_tab[16] = + { + 0, 4, 8, 12, // output transpose index order + 1, 5, 9, 13, + 2, 6, 10, 14, + 3, 7, 11, 15, + }; + vuint8m1_t index_order_u8m1 = __riscv_vle8_v_u8m1(out_index_tab, 16); + vuint8m1_t v0_out, v1_out, v2_out, v3_out; + v0_out = __riscv_vrgather_vv_u8m1(u8m1_col0123, index_order_u8m1, 16); + v1_out = __riscv_vslidedown_vx_u8m1(v0_out, 4, 4); + v2_out = __riscv_vslidedown_vx_u8m1(v0_out, 8, 4); + v3_out = __riscv_vslidedown_vx_u8m1(v0_out, 12, 4); + + /* Store 4x4 block to memory. */ + JSAMPROW outptr0 = output_buf[0] + output_col; + JSAMPROW outptr1 = output_buf[1] + output_col; + JSAMPROW outptr2 = output_buf[2] + output_col; + JSAMPROW outptr3 = output_buf[3] + output_col; + __riscv_vse8_v_u8m1(outptr0, v0_out, 4); + __riscv_vse8_v_u8m1(outptr1, v1_out, 4); + __riscv_vse8_v_u8m1(outptr2, v2_out, 4); + __riscv_vse8_v_u8m1(outptr3, v3_out, 4); +}