From 31230a559df4f0bb85a2c029dc70d576f27ffac3 Mon Sep 17 00:00:00 2001
From: sinceforYy <1017657683@qq.com>
Date: Tue, 27 Aug 2024 09:49:29 +0800
Subject: [PATCH] Zfa: Support Zfa extension

* Support fli.s, fli.d, fminm.s, fminm.d, fmaxm.s, fmaxm.d
* Support fround.s, fround.d, froundnx.s, froundnx.d, fcvtmod.w.d
* Support fleq.s, fleq.d, fltq.s, fltq.d
---
 src/main/scala/yunsuan/fpu/FloatAdder.scala   |  69 +++++++++--
 src/main/scala/yunsuan/package.scala          |   4 +
 src/main/scala/yunsuan/scalar/Convert.scala   |   4 +
 .../yunsuan/vector/VectorConvert/CVT64.scala  | 116 ++++++++++++++++--
 .../vector/VectorConvert/CVTparameter.scala   |   1 +
 .../vector/VectorConvert/Convert.scala        |  12 +-
 .../yunsuan/vector/VectorConvert/VCVT.scala   |   8 +-
 src/test/scala/top/VectorSimTop.scala         |   4 +
 8 files changed, 193 insertions(+), 25 deletions(-)

diff --git a/src/main/scala/yunsuan/fpu/FloatAdder.scala b/src/main/scala/yunsuan/fpu/FloatAdder.scala
index c4713ca..1f5059a 100644
--- a/src/main/scala/yunsuan/fpu/FloatAdder.scala
+++ b/src/main/scala/yunsuan/fpu/FloatAdder.scala
@@ -83,7 +83,9 @@ class FloatAdder() extends Module  {
   val is_flt    = io.op_code === FaddOpCode.flt
   val is_fle    = io.op_code === FaddOpCode.fle
   val is_fclass = io.op_code === FaddOpCode.fclass
-  val resultNeedBox = RegEnable(is_add || is_sub || is_min || is_max || is_fsgnj || is_fsgnjn || is_fsgnjx, fire)
+  val is_fminm  = io.op_code === FaddOpCode.fminm
+  val is_fmaxm  = io.op_code === FaddOpCode.fmaxm
+  val resultNeedBox = RegEnable(is_add || is_sub || is_min || is_max || is_fsgnj || is_fsgnjn || is_fsgnjx || is_fminm || is_fmaxm, fire)
   val fp_f64_result = F64_result
   val fp_f32_result = Cat(Fill(32, resultNeedBox), F32_result)
   val fp_f16_result = Cat(Fill(48, resultNeedBox), F16_result)
@@ -177,8 +179,10 @@ private[fpu] class FloatAdderF32F16MixedPipeline(val is_print:Boolean = false,va
   )
   val fp_a_is_NAN        = io.fp_aIsFpCanonicalNAN | Efp_a_is_all_one & fp_a_mantissa_isnot_zero
   val fp_a_is_SNAN       = !io.fp_aIsFpCanonicalNAN & Efp_a_is_all_one & fp_a_mantissa_isnot_zero & !fp_a_to32(significandWidth-2)
+  val fp_a_is_QNAN       = !io.fp_aIsFpCanonicalNAN & Efp_a_is_all_one & fp_a_mantissa_isnot_zero &  fp_a_to32(significandWidth-2)
   val fp_b_is_NAN        = io.fp_bIsFpCanonicalNAN | Efp_b_is_all_one & fp_b_mantissa_isnot_zero
   val fp_b_is_SNAN       = !io.fp_bIsFpCanonicalNAN & Efp_b_is_all_one & fp_b_mantissa_isnot_zero & !fp_b_to32(significandWidth-2)
+  val fp_b_is_QNAN       = !io.fp_bIsFpCanonicalNAN & Efp_b_is_all_one & fp_b_mantissa_isnot_zero &  fp_b_to32(significandWidth-2)
   val fp_a_is_infinite   = !io.fp_aIsFpCanonicalNAN & Efp_a_is_all_one & (!fp_a_mantissa_isnot_zero)
   val fp_b_is_infinite   = !io.fp_bIsFpCanonicalNAN & Efp_b_is_all_one & (!fp_b_mantissa_isnot_zero)
   val fp_a_is_zero       = !io.fp_aIsFpCanonicalNAN & Efp_a_is_zero & !fp_a_mantissa_isnot_zero
@@ -228,6 +232,10 @@ private[fpu] class FloatAdderF32F16MixedPipeline(val is_print:Boolean = false,va
     val is_fsgnjn = io.op_code === FaddOpCode.fsgnjn
     val is_fsgnjx = io.op_code === FaddOpCode.fsgnjx
     val is_fclass = io.op_code === FaddOpCode.fclass
+    val is_fminm  = io.op_code === FaddOpCode.fminm
+    val is_fmaxm  = io.op_code === FaddOpCode.fmaxm
+    val is_fleq   = io.op_code === FaddOpCode.fleq
+    val is_fltq   = io.op_code === FaddOpCode.fltq
 
     val fp_a_sign = fp_a_to32.head(1)
     val fp_b_sign = fp_b_to32.head(1)
@@ -250,6 +258,8 @@ private[fpu] class FloatAdderF32F16MixedPipeline(val is_print:Boolean = false,va
     val result_flt = Wire(UInt(floatWidth.W))
     val result_fle = Wire(UInt(floatWidth.W))
     val result_fclass = Wire(UInt(floatWidth.W))
+    val result_fminm = Wire(UInt(floatWidth.W))
+    val result_fmaxm = Wire(UInt(floatWidth.W))
     val in_NAN = Mux(res_is_f32, Cat(0.U(1.W),Fill(9, 1.U(1.W)),0.U(22.W)), Cat(0.U(17.W),Fill(6, 1.U(1.W)),0.U(9.W)))
     val fp_aFix = Mux(io.fp_aIsFpCanonicalNAN, in_NAN, io.fp_a)
     val fp_bFix = Mux(io.fp_bIsFpCanonicalNAN, in_NAN, io.fp_b)
@@ -308,18 +318,32 @@ private[fpu] class FloatAdderF32F16MixedPipeline(val is_print:Boolean = false,va
       fp_a_is_SNAN,
       fp_a_is_NAN & !fp_a_is_SNAN
     )))
+    result_fminm := Mux(!fp_a_is_NAN & !fp_b_is_NAN, // fminm: a/b are only candidates when neither NaN flag is set
+      Mux(fp_b_is_less || (fp_b_sign.asBool && fp_b_is_zero && fp_a_is_zero), // pick b if it compares less, or both zero flags are set and b's sign bit is 1
+        fp_b_16_or_32,
+        fp_a_16_or_32),
+      out_NAN // either operand flagged NaN
+    )
+    result_fmaxm := Mux(!fp_a_is_NAN & !fp_b_is_NAN, // fmaxm: a/b are only candidates when neither NaN flag is set
+      Mux(fp_b_is_greater.asBool || (!fp_b_sign.asBool && fp_b_is_zero && fp_a_is_zero), // pick b if it compares greater, or both zero flags are set and b's sign bit is 0
+        fp_b_16_or_32,
+        fp_a_16_or_32),
+      out_NAN // either operand flagged NaN
+    )
 
     val result_stage0 = Mux1H(
       Seq(
         is_min,
         is_max,
         is_feq,
-        is_flt,
-        is_fle,
+        is_flt | is_fltq,
+        is_fle | is_fleq,
         is_fsgnj,
         is_fsgnjn,
         is_fsgnjx,
         is_fclass,
+        is_fminm,
+        is_fmaxm,
       ),
       Seq(
         result_min,
@@ -331,11 +355,15 @@ private[fpu] class FloatAdderF32F16MixedPipeline(val is_print:Boolean = false,va
         result_fsgnjn,
         result_fsgnjx,
         result_fclass,
+        result_fminm,
+        result_fmaxm,
       )
     )
     val fflags_NV_stage0 = ((is_min | is_max) & (fp_a_is_SNAN | fp_b_is_SNAN)) |
       ((is_feq ) & (fp_a_is_SNAN | fp_b_is_SNAN)) |
-      ((is_flt | is_fle ) & (fp_a_is_NAN | fp_b_is_NAN))
+      ((is_flt | is_fle ) & (fp_a_is_NAN | fp_b_is_NAN)) |
+      ((is_fminm | is_fmaxm) & (fp_a_is_SNAN | fp_b_is_SNAN)) |
+      ((is_fltq | is_fleq) & (fp_a_is_SNAN | fp_b_is_SNAN))
     val fflags_stage0 = Cat(fflags_NV_stage0,0.U(4.W))
     io.fp_c := Mux(RegEnable(is_add | is_sub , fire),float_adder_result,RegEnable(result_stage0, fire))
     io.fflags := Mux(RegEnable(is_add | is_sub , fire),float_adder_fflags,RegEnable(fflags_stage0, fire))
@@ -390,8 +418,10 @@ private[fpu] class FloatAdderF64Pipeline(val is_print:Boolean = false,val hasMin
   val Efp_b_is_all_one   = Efp_b.andR
   val fp_a_is_NAN        = io.fp_aIsFpCanonicalNAN | Efp_a_is_all_one & fp_a_mantissa_isnot_zero
   val fp_a_is_SNAN       = !io.fp_aIsFpCanonicalNAN & Efp_a_is_all_one & fp_a_mantissa_isnot_zero & !fp_a_to64(significandWidth-2)
+  val fp_a_is_QNAN       = !io.fp_aIsFpCanonicalNAN & Efp_a_is_all_one & fp_a_mantissa_isnot_zero &  fp_a_to64(significandWidth-2)
   val fp_b_is_NAN        = io.fp_bIsFpCanonicalNAN | Efp_b_is_all_one & fp_b_mantissa_isnot_zero
   val fp_b_is_SNAN       = !io.fp_bIsFpCanonicalNAN & Efp_b_is_all_one & fp_b_mantissa_isnot_zero & !fp_b_to64(significandWidth-2)
+  val fp_b_is_QNAN       = !io.fp_bIsFpCanonicalNAN & Efp_b_is_all_one & fp_b_mantissa_isnot_zero &  fp_b_to64(significandWidth-2)
   val fp_a_is_infinite   = !io.fp_aIsFpCanonicalNAN & Efp_a_is_all_one & (!fp_a_mantissa_isnot_zero)
   val fp_b_is_infinite   = !io.fp_bIsFpCanonicalNAN & Efp_b_is_all_one & (!fp_b_mantissa_isnot_zero)
   val fp_a_is_zero = !io.fp_aIsFpCanonicalNAN & Efp_a_is_zero & !fp_a_mantissa_isnot_zero
@@ -435,6 +465,10 @@ private[fpu] class FloatAdderF64Pipeline(val is_print:Boolean = false,val hasMin
     val is_fsgnjn = io.op_code === FaddOpCode.fsgnjn
     val is_fsgnjx = io.op_code === FaddOpCode.fsgnjx
     val is_fclass = io.op_code === FaddOpCode.fclass
+    val is_fminm = io.op_code === FaddOpCode.fminm
+    val is_fmaxm = io.op_code === FaddOpCode.fmaxm
+    val is_fleq = io.op_code === FaddOpCode.fleq
+    val is_fltq = io.op_code === FaddOpCode.fltq
     val fp_a_sign = io.fp_a.head(1)
     val fp_b_sign = io.fp_b.head(1)
     val fp_b_sign_is_greater = fp_a_sign & !fp_b_sign
@@ -455,6 +489,8 @@ private[fpu] class FloatAdderF64Pipeline(val is_print:Boolean = false,val hasMin
     val result_feq = Wire(UInt(floatWidth.W))
     val result_flt = Wire(UInt(floatWidth.W))
     val result_fle = Wire(UInt(floatWidth.W))
+    val result_fminm = Wire(UInt(floatWidth.W))
+    val result_fmaxm = Wire(UInt(floatWidth.W))
     val in_NAN = Cat(0.U, Fill(exponentWidth, 1.U), 1.U, Fill(significandWidth - 2, 0.U))
     val fp_aFix = Mux(io.fp_aIsFpCanonicalNAN, in_NAN, io.fp_a)
     val fp_bFix = Mux(io.fp_bIsFpCanonicalNAN, in_NAN, io.fp_b)
@@ -506,17 +542,32 @@ private[fpu] class FloatAdderF64Pipeline(val is_print:Boolean = false,val hasMin
       fp_a_is_SNAN,
       fp_a_is_NAN & !fp_a_is_SNAN
     )))
+    result_fminm := Mux(!fp_a_is_NAN & !fp_b_is_NAN, // fminm: a/b are only candidates when neither NaN flag is set
+      Mux(fp_b_is_less || (fp_b_sign.asBool && fp_b_is_zero && fp_a_is_zero), // pick b if it compares less, or both zero flags are set and b's sign bit is 1
+        io.fp_b,
+        io.fp_a),
+      out_NAN // either operand flagged NaN
+    )
+    result_fmaxm := Mux(!fp_a_is_NAN & !fp_b_is_NAN, // fmaxm: a/b are only candidates when neither NaN flag is set
+      Mux(fp_b_is_greater.asBool || (!fp_b_sign.asBool && fp_b_is_zero && fp_a_is_zero), // pick b if it compares greater, or both zero flags are set and b's sign bit is 0
+        io.fp_b,
+        io.fp_a),
+      out_NAN // either operand flagged NaN
+    )
+
     val result_stage0 = Mux1H(
       Seq(
         is_min,
         is_max,
         is_feq,
-        is_flt,
-        is_fle,
+        is_flt | is_fltq,
+        is_fle | is_fleq,
         is_fsgnj,
         is_fsgnjn,
         is_fsgnjx,
         is_fclass,
+        is_fminm,
+        is_fmaxm,
       ),
       Seq(
         result_min,
@@ -528,11 +579,15 @@ private[fpu] class FloatAdderF64Pipeline(val is_print:Boolean = false,val hasMin
         result_fsgnjn,
         result_fsgnjx,
         result_fclass,
+        result_fminm,
+        result_fmaxm,
       )
     )
     val fflags_NV_stage0 = ((is_min | is_max) & (fp_a_is_SNAN | fp_b_is_SNAN)) |
       (is_feq  & (fp_a_is_SNAN | fp_b_is_SNAN)) |
-      ((is_flt | is_fle ) & (fp_a_is_NAN | fp_b_is_NAN))
+      ((is_flt | is_fle ) & (fp_a_is_NAN | fp_b_is_NAN)) |
+      ((is_fminm | is_fmaxm) & (fp_a_is_SNAN | fp_b_is_SNAN)) |
+      ((is_fltq | is_fleq) & (fp_a_is_SNAN | fp_b_is_SNAN))
     val fflags_stage0 = Cat(fflags_NV_stage0, 0.U(4.W))
     io.fp_c := Mux(RegEnable(is_add | is_sub, fire), float_adder_result, RegEnable(result_stage0, fire))
     io.fflags := Mux(RegEnable(is_add | is_sub, fire), float_adder_fflags, RegEnable(fflags_stage0, fire))
diff --git a/src/main/scala/yunsuan/package.scala b/src/main/scala/yunsuan/package.scala
index e16c551..ed7daae 100644
--- a/src/main/scala/yunsuan/package.scala
+++ b/src/main/scala/yunsuan/package.scala
@@ -582,6 +582,10 @@ object VfcvtType {
     def fsgnj    = "b00110".U(width.W)
     def fsgnjx   = "b01000".U(width.W)
     def fsgnjn   = "b00111".U(width.W)
+    def fminm    = "b11110".U(width.W) // Zfa fminm: decoded as is_fminm in FloatAdder; NV raised only for signaling NaN operands
+    def fmaxm    = "b10011".U(width.W) // Zfa fmaxm: decoded as is_fmaxm in FloatAdder; NV raised only for signaling NaN operands
+    def fleq     = "b11100".U(width.W) // Zfa fleq: shares result_fle with fle; NV raised only for signaling NaN operands
+    def fltq     = "b11011".U(width.W) // Zfa fltq: shares result_flt with flt; NV raised only for signaling NaN operands
   }
   object FmaOpCode {
     def width = 4
diff --git a/src/main/scala/yunsuan/scalar/Convert.scala b/src/main/scala/yunsuan/scalar/Convert.scala
index 530d0b1..d91e7ee 100644
--- a/src/main/scala/yunsuan/scalar/Convert.scala
+++ b/src/main/scala/yunsuan/scalar/Convert.scala
@@ -85,6 +85,8 @@ class FpCvtIO(width: Int) extends Bundle {
   val sew = Input(UInt(2.W))
   val rm = Input(UInt(3.W))
   val isFpToVecInst = Input(Bool())
+  val isFround = Input(UInt(2.W))
+  val isFcvtmod = Input(Bool())
 
   val result = Output(UInt(width.W))
   val fflags = Output(UInt(5.W))
@@ -151,6 +153,8 @@ class FPCVT(xlen :Int) extends Module{
   fcvt.io.opType := io.opType
   fcvt.io.rm := io.rm
   fcvt.io.isFpToVecInst := io.isFpToVecInst
+  fcvt.io.isFround := io.isFround
+  fcvt.io.isFcvtmod := io.isFcvtmod
   fcvt.io.input1H := input1H
   fcvt.io.output1H := output1H
 
diff --git a/src/main/scala/yunsuan/vector/VectorConvert/CVT64.scala b/src/main/scala/yunsuan/vector/VectorConvert/CVT64.scala
index d7b605d..9f4387b 100644
--- a/src/main/scala/yunsuan/vector/VectorConvert/CVT64.scala
+++ b/src/main/scala/yunsuan/vector/VectorConvert/CVT64.scala
@@ -7,8 +7,8 @@ import yunsuan.vector.VectorConvert.utils._
 import yunsuan.vector.VectorConvert.RoundingModle._
 import yunsuan.util._
 class CVT64(width: Int = 64,mode: Boolean) extends CVT(width){
-  val (fire, src, sew, opType, rm, input1H, output1H, isFpToVecInst) =
-    (io.fire, io.src, io.sew, io.opType, io.rm, io.input1H, io.output1H, io.isFpToVecInst)
+  val (fire, src, sew, opType, rm, input1H, output1H, isFpToVecInst, isFround, isFcvtmod) =
+    (io.fire, io.src, io.sew, io.opType, io.rm, io.input1H, io.output1H, io.isFpToVecInst, io.isFround, io.isFcvtmod)
   val fireReg = GatedValidRegNext(fire)
 
   val outIsFpNext = opType.tail(1).head(1).asBool
@@ -26,6 +26,8 @@ class CVT64(width: Int = 64,mode: Boolean) extends CVT(width){
     (!inIsFpNext, inIsFpNext && outIsFpNext && isWiden, inIsFpNext && outIsFpNext && isNarrow,
       !outIsFpNext, inIsFpNext && outIsFpNext && isCrossHigh, inIsFpNext && outIsFpNext && isCrossLow)
 
+  val isFroundOrFroundnxNext = isFround.orR
+
   val isInt2Fp = RegEnable(isInt2FpNext, false.B, fire)
   val isFpWiden = RegEnable(isFpWidenNext, false.B, fire)
   val isFpNarrow = RegEnable(isFpNarrowNext, false.B, fire)
@@ -33,7 +35,9 @@ class CVT64(width: Int = 64,mode: Boolean) extends CVT(width){
   val isFp2Int = RegEnable(isFp2IntNext, false.B, fire)
   val isFpCrossHigh = RegEnable(isFpCrossHighNext, false.B, fire)
   val isFpCrossLow = RegEnable(isFpCrossLowNext, false.B, fire)
-  val isFPsrc = isFpWiden || isFpNarrow || isFpCrossHigh || isFpCrossLow || isFp2Int
+  val isFroundReg = RegEnable(isFroundOrFroundnxNext, false.B, fire)
+  val isFcvtmodReg = RegEnable(isFcvtmod, false.B, fire)
+  val isFPsrc = isFpWiden || isFpNarrow || isFpCrossHigh || isFpCrossLow || isFp2Int || isFroundReg || isFcvtmodReg
 
   val s0_outIsF64 =  outIsFpNext && output1H(3)
   val s0_outIsF32 =  outIsFpNext && output1H(2)
@@ -66,6 +70,8 @@ class CVT64(width: Int = 64,mode: Boolean) extends CVT(width){
   fpcvt.io.input1H := input1H
   fpcvt.io.output1H := output1H
   fpcvt.io.isFpToVecInst := isFpToVecInst
+  fpcvt.io.isFround := isFround
+  fpcvt.io.isFcvtmod := isFcvtmod
 
   val s1_resultForfpCanonicalNAN = Mux1H(
     Seq(s1_outIsF64, s1_outIsF32, s1_outIsF16, s1_outIsU32 || s1_outIsU64, s1_outIsS32, s1_outIsS64),
@@ -121,6 +127,8 @@ class CVT_IO extends Bundle{
   val input1H = Input(UInt(4.W))
   val output1H = Input(UInt(4.W))
   val isFpToVecInst = Input(Bool())
+  val isFround = Input(UInt(2.W))
+  val isFcvtmod = Input(Bool())
   val result = Output(UInt(64.W))
   val fflags = Output(UInt(5.W))
 }
@@ -142,8 +150,8 @@ class FP_INCVT extends Module {
   val intParamMap = (0 to 3).map(i => (1 << i) * 8)
   val widthExpAdder = 13 // 13bits is enough
   //input
-  val (fire, src, opType, rmNext, input1H, output1H, isFpToVecInst) =
-    (io.fire, io.src, io.opType, io.rm, io.input1H, io.output1H, io.isFpToVecInst)
+  val (fire, src, opType, rmNext, input1H, output1H, isFpToVecInst, isFround, isFcvtmod) =
+    (io.fire, io.src, io.opType, io.rm, io.input1H, io.output1H, io.isFpToVecInst, io.isFround, io.isFcvtmod)
   val fireReg = GatedValidRegNext(fire)
 
   val isWiden = !opType(4) && opType(3)
@@ -157,6 +165,9 @@ class FP_INCVT extends Module {
   val float1HSrcNext = input1H.head(3)//exclude f8
   val float1HOutNext = output1H.head(3)//exclude f8
 
+  val isFroundOrFroundnxNext = isFround.orR
+  val isFroundnxNext = isFround(1)
+
   //fp input extend
   val srcMap = (0 to 3).map(i => src((1 << i) * 8 - 1, 0))
   val floatMap = srcMap.zipWithIndex.map{case (float,i) => floatExtend(float, i)}.drop(1)
@@ -180,6 +191,9 @@ class FP_INCVT extends Module {
     (outIsFpNext && isWiden, outIsFpNext && isNarrow, !outIsFpNext,
       outIsFpNext && isCrossHigh, outIsFpNext && isCrossLow)
 
+  val froundOrFroundnxIsZeroOrInfNext = isFroundOrFroundnxNext && (isZeroSrcNext || isInfSrcNext)
+  val fcvtmodIsInfOrNaNNext = isFcvtmod && (isInfSrcNext || isNaNSrcNext)
+
   //s1
   val expIsOnesSrc = RegEnable(expIsOnesSrcNext, false.B, fire)
   val fracNotZeroSrc = RegEnable(fracNotZeroSrcNext, false.B, fire)
@@ -198,6 +212,11 @@ class FP_INCVT extends Module {
   val s0_fpCanonicalNAN = isFpToVecInst & (input1H(1) & !src.head(48).andR | input1H(2) & !src.head(32).andR)
   val s1_fpCanonicalNAN = RegEnable(s0_fpCanonicalNAN, fire)
 
+  val isFroundnxReg = RegEnable(isFroundnxNext, false.B, fire)
+  val isFroundOrFroundnxReg = RegEnable(isFroundOrFroundnxNext, false.B, fire)
+  val froundOrFroundnxIsZeroOrInf = RegEnable(froundOrFroundnxIsZeroOrInfNext, false.B, fire)
+  val fcvtmodIsInfOrNaN = RegEnable(fcvtmodIsInfOrNaNNext, false.B, fire)
+
   // for fpnarrow sub
   val trunSticky = RegEnable(fracSrc.tail(f32.fracWidth).orR, false.B, fire)
   val signSrc = RegEnable(signSrcNext, false.B, fire)
@@ -284,6 +303,46 @@ class FP_INCVT extends Module {
   val sticky = Wire(Bool())
   inRounder := inRounderTmp
   sticky := stickyTmp
+
+  /**
+   * fround / froundnx: shift the extended significand by |exp - bias|
+   * and register the shifted value plus the raw exponent/fraction for cycle 1
+   * cycle: 0
+   */
+  val froundExpDeltaNext = Wire(UInt(6.W)) // shift distance; keeps only the low 6 bits of the exponent difference. NOTE(review): confirm larger differences never need a meaningful shift
+  val froundFracShiftNext = Wire(UInt(64.W)) // shifted significand, cut to 64 bits
+  val froundExpSubBias = Wire(UInt(f64.expWidth.W)) // bias - exp, wrapped to f64.expWidth bits
+
+  val froundMaxExpNext = Mux1H(float1HOutNext, fpParamMap.map(fp => fp.froundMaxExp.U)) // per-format threshold selected by the one-hot output format
+  val froundFracNext = fracValueSrc ## 0.U(11.W) // fracValueSrc padded with 11 low zero bits
+
+  val froundExpLessThanBiasNext = Mux1H(float1HOutNext, fpParamMap.map(fp => !expSrcNext(fp.expWidth-1) && !expSrcNext(fp.expWidth-2, 0).andR)) // exponent MSB clear and lower bits not all ones; presumably exp < bias with bias = 2^(expWidth-1)-1 - TODO confirm
+  val froundExpGreaterThanMaxExpNext = expSrcNext > froundMaxExpNext
+
+  froundExpSubBias := Mux1H(float1HOutNext, fpParamMap.map(fp => fp.bias.U)) - expSrcNext
+  froundExpDeltaNext := Mux(froundExpLessThanBiasNext, froundExpSubBias, 1.U + ~froundExpSubBias) // bias - exp below the bias, otherwise its two's-complement negation (exp - bias)
+  froundFracShiftNext := Mux(froundExpLessThanBiasNext, froundFracNext >> froundExpDeltaNext, froundFracNext << froundExpDeltaNext) // right shift below the bias, left shift otherwise
+
+  val fracShiftMaskNext = f64.fracWidth.U - froundExpDeltaNext // used in cycle 1 as the shift amount for froundShiftMask / froundUpShiftMask
+
+  val froundFracShift = RegEnable(froundFracShiftNext, 0.U, fire) // cycle-0 values captured on fire for use in cycle 1
+  val froundExpLessThanBias = RegEnable(froundExpLessThanBiasNext, false.B, fire)
+  val froundExpGreaterThanMaxExp = RegEnable(froundExpGreaterThanMaxExpNext, false.B, fire)
+  val fracShiftMask = RegEnable(fracShiftMaskNext, 0.U, fire)
+  val froundOldExp = RegEnable(expSrcNext, 0.U, fire) // raw source exponent, reused to rebuild the input in cycle 1
+  val froundOldFrac = RegEnable(fracSrc, 0.U, fire) // raw source fraction, reused to rebuild the input in cycle 1
+
+  // cycle1: build the masked (truncated) input and the "masked + one unit" candidate used by the normal fround path
+  val froundShiftMask = Wire(UInt(64.W))
+  val froundUpShiftMask = Wire(UInt(53.W)) // 53 bits: fracShiftMask reaches f64.fracWidth (52) when exp == bias, and 1 << 52 needs bit 52
+  val froundOldInput = Wire(UInt(64.W))
+  val froundUpInput = Wire(UInt(64.W))
+
+  froundShiftMask := ~0.U(64.W) << fracShiftMask // all ones with the low fracShiftMask bits cleared
+  froundUpShiftMask := 1.U << fracShiftMask // one unit at the lowest bit kept by froundShiftMask
+  froundOldInput := Cat(signSrc, froundOldExp, froundOldFrac) & froundShiftMask // rebuilt input with the masked-off low bits dropped
+  froundUpInput := froundOldInput + froundUpShiftMask // a carry out of the fraction field increments the exponent field
+
   /** rounder
    * for: int->fp, fp-fp Narrow, fp->int
    * cycle: 1
@@ -304,10 +363,14 @@ class FP_INCVT extends Module {
     (rounderMap(0), rounderMap(1), rounderMap(2), rounderMap(3))
   }
   val rounderInput = Mux(isFp2Int, inRounder.head(64),  Mux1H(float1HOut, rounderInputMap))
+
+  val froundRoundIn = froundFracShift.tail(1).head(1).asBool
+  val froundStickyIn = Mux1H(float1HOut, fpParamMap.map(fp => froundFracShift.tail(2).head(fp.fracWidth - 1).orR))
+
   val rounder = Module(new RoundingUnit(64))
-  rounder.io.in := rounderInput
-  rounder.io.roundIn := Mux(isFp2Int, inRounder(0), Mux1H(float1HOut, rounerInMap))
-  rounder.io.stickyIn := Mux(isFp2Int, sticky, Mux1H(float1HOut, rounderStikyMap))
+  rounder.io.in := Mux(isFroundOrFroundnxReg, froundFracShift, rounderInput)
+  rounder.io.roundIn := Mux(isFroundOrFroundnxReg, froundRoundIn, Mux(isFp2Int, inRounder(0), Mux1H(float1HOut, rounerInMap)))
+  rounder.io.stickyIn := Mux(isFroundOrFroundnxReg, froundStickyIn, Mux(isFp2Int, sticky, Mux1H(float1HOut, rounderStikyMap)))
   rounder.io.signIn := signSrc
   rounder.io.rm := rm
 
@@ -433,6 +496,35 @@ class FP_INCVT extends Module {
     }
     val fpNarrowResultMap: Seq[UInt] = Seq(f16, f32).map(fp => Mux1H(result1H.asBools.reverse, fpNarrowResultMapGen(fp)))
     resultNext := Mux1H(float1HOut.tail(1), fpNarrowResultMap)
+  }.elsewhen(isFroundOrFroundnxReg) {
+    val oldInputReg = Mux1H(float1HOut, fpParamMap.map(fp => signSrc ## froundOldExp(fp.expWidth - 1, 0) ## froundOldFrac.head(fp.fracWidth)))
+
+    nv := isSNaNSrc
+    dz := false.B
+    of := false.B
+    uf := false.B
+    nx := isFroundnxReg && nxRounded && !isNaNSrc
+
+    val result1H = Cat(
+      froundOrFroundnxIsZeroOrInf || froundExpGreaterThanMaxExp && !isNaNSrc,
+      isNaNSrc,
+      froundExpLessThanBias,
+      !froundExpLessThanBias && !froundOrFroundnxIsZeroOrInf && !froundExpGreaterThanMaxExp,
+    )
+
+    def froundResultMapGen(fp: FloatFormat): Seq[UInt] = { // candidate results for format fp; index i pairs with result1H.asBools.reverse(i)
+      VecInit((0 to 3).map {
+        case 0 => oldInputReg // input returned unchanged (zero/inf source, or non-NaN exponent above froundMaxExp)
+        case 1 => 0.U ## ~0.U(fp.expWidth.W) ## 1.U ## 0.U((fp.fracWidth - 1).W) // NaN source: sign 0, exponent all ones, only the fraction MSB set
+        case 2 => signSrc ## Mux(upRounded, 0.U ## Fill(fp.expWidth - 1, 1.U(1.W)), 0.U(fp.expWidth.W)) ## 0.U(fp.fracWidth.W) // exponent below bias: signed zero, or exponent 0b011..1 with zero fraction when rounded up
+        case 3 => Mux(upRounded, // remaining case: masked input, plus one unit when rounded up; fields are re-sliced from the 64-bit layout to fp's widths
+          froundUpInput.head(1) ## froundUpInput.tail(1).head(f64.expWidth)(fp.expWidth - 1, 0) ## froundUpInput.tail(1 + f64.expWidth).head(fp.fracWidth),
+          froundOldInput.head(1) ## froundOldInput.tail(1).head(f64.expWidth)(fp.expWidth - 1, 0) ## froundOldInput.tail(1 + f64.expWidth).head(fp.fracWidth))
+      })
+    }
+
+    val froundResultMap: Seq[UInt] = fpParamMap.map(fp => Mux1H(result1H.asBools.reverse, froundResultMapGen(fp)))
+    resultNext := Mux1H(float1HOut, froundResultMap)
   }.otherwise{
     /** out is int, any fp->any int/uint
      * drop the shift left!
@@ -475,10 +567,10 @@ class FP_INCVT extends Module {
     uf := false.B
     nx := Mux(hasSignInt, toInx, toUnx)
     val result1H = Cat(
-      (!hasSignInt && !toUnv) || (hasSignInt && !toInv), //toUnv include nan & inf
-      !hasSignInt && toUnv && (isNaNSrc || !signSrc && (isInfSrc || ofExpRounded)),
-      !hasSignInt && toUnv && signSrc && !isNaNSrc,
-      hasSignInt && toInv
+      ((!hasSignInt && !toUnv) || (hasSignInt && !toInv)) && !fcvtmodIsInfOrNaN, //toUnv include nan & inf
+      !hasSignInt && toUnv && (isNaNSrc || !signSrc && (isInfSrc || ofExpRounded)) && !fcvtmodIsInfOrNaN,
+      !hasSignInt && toUnv && signSrc && !isNaNSrc || fcvtmodIsInfOrNaN,
+      hasSignInt && toInv && !fcvtmodIsInfOrNaN
     )
     resultNext := Mux1H(result1H.asBools.reverse, Seq(
       normalResult,
diff --git a/src/main/scala/yunsuan/vector/VectorConvert/CVTparameter.scala b/src/main/scala/yunsuan/vector/VectorConvert/CVTparameter.scala
index 503eb6d..ccd072c 100644
--- a/src/main/scala/yunsuan/vector/VectorConvert/CVTparameter.scala
+++ b/src/main/scala/yunsuan/vector/VectorConvert/CVTparameter.scala
@@ -21,6 +21,7 @@ trait FloatFormat{
   def precision = fracWidth + 1
   def maxExp = (BigInt(1) << expWidth) - 2
   def minExp = 1
+  def froundMaxExp = fracWidth + bias // fround threshold: FP_INCVT returns non-NaN inputs whose biased exponent is above this unchanged
 }
 
 object f16 extends FloatFormat {
diff --git a/src/main/scala/yunsuan/vector/VectorConvert/Convert.scala b/src/main/scala/yunsuan/vector/VectorConvert/Convert.scala
index d891936..c024c2d 100644
--- a/src/main/scala/yunsuan/vector/VectorConvert/Convert.scala
+++ b/src/main/scala/yunsuan/vector/VectorConvert/Convert.scala
@@ -12,6 +12,8 @@ class VectorCvtIO(width: Int) extends Bundle {
   val sew = Input(UInt(2.W))
   val rm = Input(UInt(3.W))
   val isFpToVecInst = Input(Bool())
+  val isFround = Input(UInt(2.W))
+  val isFcvtmod = Input(Bool())
 
   val result = Output(UInt(width.W))
   val fflags = Output(UInt(20.W))
@@ -20,7 +22,7 @@ class VectorCvtIO(width: Int) extends Bundle {
 class VectorCvt(xlen :Int) extends Module{
 
   val io = IO(new VectorCvtIO(xlen))
-  val (fire, src, opType, sew, rm, isFpToVecInst) = (io.fire, io.src, io.opType, io.sew, io.rm, io.isFpToVecInst)
+  val (fire, src, opType, sew, rm, isFpToVecInst, isFround, isFcvtmod) = (io.fire, io.src, io.opType, io.sew, io.rm, io.isFpToVecInst, io.isFround, io.isFcvtmod)
   val widen = opType(4, 3) // 0->single 1->widen 2->norrow => width of result
 
   // input width 8， 16， 32， 64
@@ -89,10 +91,10 @@ class VectorCvt(xlen :Int) extends Module{
   val in3 = Mux1H(inputWidth1H, Seq(element8(3), element16(3), 0.U, 0.U))
 
 
-  val (result0, fflags0) = VCVT(64)(fire, in0, opType, sew, rm, input1H, output1H, isFpToVecInst)
-  val (result1, fflags1) = VCVT(32)(fire, in1, opType, sew, rm, input1H, output1H, isFpToVecInst)
-  val (result2, fflags2) = VCVT(16)(fire, in2, opType, sew, rm, input1H, output1H, isFpToVecInst)
-  val (result3, fflags3) = VCVT(16)(fire, in3, opType, sew, rm, input1H, output1H, isFpToVecInst)
+  val (result0, fflags0) = VCVT(64)(fire, in0, opType, sew, rm, input1H, output1H, isFpToVecInst, isFround, isFcvtmod)
+  val (result1, fflags1) = VCVT(32)(fire, in1, opType, sew, rm, input1H, output1H, isFpToVecInst, isFround, isFcvtmod)
+  val (result2, fflags2) = VCVT(16)(fire, in2, opType, sew, rm, input1H, output1H, isFpToVecInst, isFround, isFcvtmod)
+  val (result3, fflags3) = VCVT(16)(fire, in3, opType, sew, rm, input1H, output1H, isFpToVecInst, isFround, isFcvtmod)
 
   io.result := Mux1H(outputWidth1H, Seq(
     result3(7,0) ## result2(7,0) ## result1(7,0) ## result0(7,0),
diff --git a/src/main/scala/yunsuan/vector/VectorConvert/VCVT.scala b/src/main/scala/yunsuan/vector/VectorConvert/VCVT.scala
index ca4734f..3c15d6d 100644
--- a/src/main/scala/yunsuan/vector/VectorConvert/VCVT.scala
+++ b/src/main/scala/yunsuan/vector/VectorConvert/VCVT.scala
@@ -12,6 +12,8 @@ class CVTIO(width: Int) extends Bundle {
   val input1H = Input(UInt(4.W))
   val output1H = Input(UInt(4.W))
   val isFpToVecInst = Input(Bool())
+  val isFround = Input(UInt(2.W))
+  val isFcvtmod = Input(Bool())
   val result = Output(UInt(width.W))
   val fflags = Output(UInt(5.W))
 }
@@ -39,7 +41,9 @@ object VCVT {
              rm:      UInt,
              input1H:      UInt,
              output1H:      UInt,
-             isFpToVecInst: Bool
+             isFpToVecInst: Bool,
+             isFround: UInt,
+             isFcvtmod: Bool
            ): (UInt, UInt) = {
     val vcvtWraper = Module(new VCVT(width))
     vcvtWraper.io.fire := fire
@@ -50,6 +54,8 @@ object VCVT {
     vcvtWraper.io.input1H := input1H
     vcvtWraper.io.output1H := output1H
     vcvtWraper.io.isFpToVecInst := isFpToVecInst
+    vcvtWraper.io.isFround := isFround
+    vcvtWraper.io.isFcvtmod := isFcvtmod
     (vcvtWraper.io.result, vcvtWraper.io.fflags)
   }
 }
\ No newline at end of file
diff --git a/src/test/scala/top/VectorSimTop.scala b/src/test/scala/top/VectorSimTop.scala
index 9d12b91..7851e5c 100644
--- a/src/test/scala/top/VectorSimTop.scala
+++ b/src/test/scala/top/VectorSimTop.scala
@@ -273,6 +273,8 @@ class SimTop() extends VPUTestModule {
     vcvt.io.rm := rm
     vcvt.io.src := src1 // 128 bit->vcvt
     vcvt.io.isFpToVecInst := false.B
+    vcvt.io.isFround := 0.U
+    vcvt.io.isFcvtmod := false.B
     vcvt_result.vxsat := 0.U
     vcvt_result.result(i) := vcvt.io.result
     vcvt_result.fflags(i) := vcvt.io.fflags
@@ -296,6 +298,8 @@ class SimTop() extends VPUTestModule {
     fpcvt.io.rm := rm
     fpcvt.io.src := src1
     fpcvt.io.isFpToVecInst := true.B
+    fpcvt.io.isFround := 0.U
+    fpcvt.io.isFcvtmod := false.B
     fpcvt_result.vxsat := 0.U
     fpcvt_result.result(i) := fpcvt.io.result
     fpcvt_result.fflags(i) := fpcvt.io.fflags