From 4068fded3b0d892e9f3fd361d19e7a1fedb38e4a Mon Sep 17 00:00:00 2001
From: m <m@bitsnbites.eu>
Date: Sun, 9 Jan 2022 22:19:20 +0100
Subject: [PATCH 1/2] femtorv32-electron: Add vector support

This is a proof of concept implementation that maps virtual vector
registers onto the scalar register file.
---
 FemtoRV/RTL/PROCESSOR/femtorv32_electron.v | 67 ++++++++++++++++++++--
 1 file changed, 61 insertions(+), 6 deletions(-)

diff --git a/FemtoRV/RTL/PROCESSOR/femtorv32_electron.v b/FemtoRV/RTL/PROCESSOR/femtorv32_electron.v
index cdc46ea1..bb4f239e 100644
--- a/FemtoRV/RTL/PROCESSOR/femtorv32_electron.v
+++ b/FemtoRV/RTL/PROCESSOR/femtorv32_electron.v
@@ -15,6 +15,32 @@
 // Bruno Levy, Matthias Koch, 2020-2021
 /*******************************************************************/
 
+/*******************************************************************/
+// Custom vector extension
+//
+//  The two least significant bits of the instruction word controls
+//  the scalar/vector operation:
+//
+//    2'b00: vector <- vector,vector (extension)
+//    2'b01: vector <- vector,scalar (extension)
+//    2'b10: vector <- scalar,vector (extension)
+//    2'b11: scalar <- scalar,scalar (standard RV32I)
+//
+//  Vector registers are mapped onto the scalar register file:
+//
+//    V0 = [ X0, X1, X2, X3] (avoid! clobbers RA, SP, GP!)
+//    V1 = [ X4, X5, X6, X7] (clobbers TP)
+//    V2 = [ X8, X9,X10,X11] (clobbers FP)
+//    V3 = [X12,X13,X14,X15]
+//    V4 = [X16,X17,X18,X19]
+//    V5 = [X20,X21,X22,X23]
+//    V6 = [X24,X25,X26,X27]
+//    V7 = [X28,X29,X30,X31] (clobbers VL!)
+//
+//  Furthermore, scalar register X31 maps the the VL (Vector Length)
+//  register.
+/*******************************************************************/
+
 // Firmware generation flags for this processor
 `define NRV_ARCH     "rv32im"
 `define NRV_ABI      "ilp32"
@@ -45,7 +71,7 @@ module FemtoRV32(
    // https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
 
    // The destination register
-   wire [4:0] rdId = instr[11:7];
+   wire [4:0] rdId = dstIsVec ? {instr[9:7],vecIdx} : instr[11:7];
 
    // The ALU function, decoded in 1-hot form (doing so reduces LUT count)
    // It is used as follows: funct3Is[val] <=> funct3 == val
@@ -236,6 +262,10 @@ module FemtoRV32(
    reg  [31:2] instr;        // Latched instruction. Note that bits 0 and 1 are
                              // ignored (not used in RV32I base instr set).
 
+   reg  src1IsVec;           // Source operand 1 is vector?
+   reg  src2IsVec;           // Source operand 2 is vector?
+   wire dstIsVec = src1IsVec | src2IsVec;
+
    wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
 
    // An adder used to compute branch address, JAL address and AUIPC.
@@ -378,10 +408,17 @@ module FemtoRV32(
                          jumpToPCplusImm  ? PCplusImm :
                          PCplus4;
 
+   // Vector state.
+   reg [1:0] vecIdx;
+   reg [1:0] vecLen;
+   wire [1:0] vecIdx_new = vecIdx + 1;
+   wire vecOpDone = (vecIdx == vecLen) | !(src1IsVec | src2IsVec);
+
    always @(posedge clk) begin
       if(!reset) begin
          state      <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
          PC         <= RESET_ADDR[ADDR_WIDTH-1:0];
+         vecLen     <= 2'b00;
       end else
 
       // See note [1] at the end of this file.
@@ -390,19 +427,37 @@ module FemtoRV32(
 
         state[WAIT_INSTR_bit]: begin
            if(!mem_rbusy) begin // may be high when executing from SPI flash
-              rs1 <= registerFile[mem_rdata[19:15]];
-              rs2 <= registerFile[mem_rdata[24:20]];
-              instr <= mem_rdata[31:2]; // Bits 0 and 1 are ignored (see
-              state <= EXECUTE;         // also the declaration of instr).
+              // Bits 0 and 1 of the instruction word indicate vector mode of the source operands.
+              src1IsVec <= !mem_rdata[0];
+              src2IsVec <= !mem_rdata[1];
+              rs1 <= mem_rdata[0] ? registerFile[mem_rdata[19:15]] :
+                                    registerFile[{mem_rdata[17:15],2'b00}];
+              rs2 <= mem_rdata[1] ? registerFile[mem_rdata[24:20]]:
+                                    registerFile[{mem_rdata[22:20],2'b00}];
+
+              // Restart vector element counter.
+              vecIdx <= 2b'00;
+
+              // Latch instruction word.
+              instr <= mem_rdata[31:2]; // (see declaration of instr).
+              state <= EXECUTE;
            end
         end
 
         state[EXECUTE_bit]: begin
-           PC <= PC_new;
+           if(vecOpDone) PC <= PC_new;
+
+           // Iterate over the source vector registers.
+           rs1 <= registerFile[{instr[17:15],vecIdx_new}];
+           rs2 <= registerFile[{instr[22:20],vecIdx_new}];
+           vecIdx <= vecIdx_new;
+
+           // TODO(m): Handle state transitions for vector operations!
            state <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
         end
 
         state[WAIT_ALU_OR_MEM_bit]: begin
+           // TODO(m): Handle state transitions for vector operations!
            if(!aluBusy & !mem_rbusy & !mem_wbusy) state <= FETCH_INSTR;
         end
 

From f2f6125b00d35595b3a6c45ffad5fb210da508d1 Mon Sep 17 00:00:00 2001
From: m <m@bitsnbites.eu>
Date: Sun, 9 Jan 2022 22:44:12 +0100
Subject: [PATCH 2/2] fixup! femtorv32-electron: Add vector support

Implement VSETVL and vector op state transitions.

Also make the vector length build-time configurable.
---
 FemtoRV/RTL/PROCESSOR/femtorv32_electron.v | 87 +++++++++++++++-------
 1 file changed, 62 insertions(+), 25 deletions(-)

diff --git a/FemtoRV/RTL/PROCESSOR/femtorv32_electron.v b/FemtoRV/RTL/PROCESSOR/femtorv32_electron.v
index bb4f239e..2c0a82af 100644
--- a/FemtoRV/RTL/PROCESSOR/femtorv32_electron.v
+++ b/FemtoRV/RTL/PROCESSOR/femtorv32_electron.v
@@ -35,10 +35,11 @@
 //    V4 = [X16,X17,X18,X19]
 //    V5 = [X20,X21,X22,X23]
 //    V6 = [X24,X25,X26,X27]
-//    V7 = [X28,X29,X30,X31] (clobbers VL!)
+//    V7 = [X28,X29,X30,X31]
 //
-//  Furthermore, scalar register X31 maps the the VL (Vector Length)
-//  register.
+//  Furthermore the VSETVL instruction is added, using the same
+//  encoding and definition as in the V extension, except that the
+//  rs2 operand (vtype setting) is ignored.
 /*******************************************************************/
 
 // Firmware generation flags for this processor
@@ -62,6 +63,10 @@ module FemtoRV32(
    parameter RESET_ADDR       = 32'h00000000;
    parameter ADDR_WIDTH       = 24;
 
+   // Vector configuration.
+   parameter VL_WIDTH         = 2;
+   localparam MAX_VL          = 1 << VL_WIDTH;
+
    /***************************************************************************/
    // Instruction decoding.
    /***************************************************************************/
@@ -71,7 +76,7 @@ module FemtoRV32(
    // https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
 
    // The destination register
-   wire [4:0] rdId = dstIsVec ? {instr[9:7],vecIdx} : instr[11:7];
+   wire [4:0] rdId = isVectorOp ? {instr[11-VL_WIDTH:7], vecIdx} : instr[11:7];
 
    // The ALU function, decoded in 1-hot form (doing so reduces LUT count)
    // It is used as follows: funct3Is[val] <=> funct3 == val
@@ -99,6 +104,9 @@ module FemtoRV32(
    wire isJAL     =  (instr[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
    wire isSYSTEM  =  (instr[6:2] == 5'b11100); // rd <- CSR <- rs1/uimm5
 
+   // The vector extension adds VSETVL.
+   wire isVSETVL  =  (instr[6:2] == 5'b10101); // rd <- VL <- min(rs1, MAX_VL)
+
    wire isALU = isALUimm | isALUreg;
 
    /***************************************************************************/
@@ -110,9 +118,12 @@ module FemtoRV32(
    reg [31:0] registerFile [31:0];
 
    always @(posedge clk) begin
-     if (writeBack)
+     if (writeBack) begin
        if (rdId != 0)
          registerFile[rdId] <= writeBackData;
+       if (isVSETVL)
+         VL <= writeBackData[VL_WIDTH-1:0];
+     end
    end
 
    /***************************************************************************/
@@ -216,21 +227,22 @@ module FemtoRV32(
    reg [62:0] divisor;
    reg [31:0] quotient;
    reg [31:0] quotient_msk;
+   reg        div_sign;  // Registered since aluIn1/2 may change before the
+                         // division iterations are done (for vector division)
 
    wire divstep_do = divisor <= {31'b0, dividend};
 
    wire [31:0] dividendN     = divstep_do ? dividend - divisor[31:0] : dividend;
    wire [31:0] quotientN     = divstep_do ? quotient | quotient_msk  : quotient;
 
-   wire div_sign = ~instr[12] & (instr[13] ? aluIn1[31] : 
-                    (aluIn1[31] != aluIn2[31]) & |aluIn2);
-
    always @(posedge clk) begin
       if (isDivide & aluWr) begin
 	 dividend <=   ~instr[12] & aluIn1[31] ? -aluIn1 : aluIn1;
 	 divisor  <= {(~instr[12] & aluIn2[31] ? -aluIn2 : aluIn2), 31'b0};
 	 quotient <= 0;
 	 quotient_msk <= 1 << 31;
+	 div_sign <= ~instr[12] & (instr[13] ? aluIn1[31] :
+	             (aluIn1[31] != aluIn2[31]) & |aluIn2);
       end else begin
 	 dividend     <= dividendN;
 	 divisor      <= divisor >> 1;
@@ -264,7 +276,7 @@ module FemtoRV32(
 
    reg  src1IsVec;           // Source operand 1 is vector?
    reg  src2IsVec;           // Source operand 2 is vector?
-   wire dstIsVec = src1IsVec | src2IsVec;
+   wire isVectorOp = src1IsVec | src2IsVec;
 
    wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
 
@@ -296,6 +308,21 @@ module FemtoRV32(
    wire sel_cyclesh = (instr[31:20] == 12'hC80);
    wire [31:0] CSR_read = sel_cyclesh ? cycles[63:32] : cycles[31:0];
 
+   /***************************************************************************/
+   // Vector length register.
+   /***************************************************************************/
+
+   // The size of the VL register is log2(MAX_VL) bits. When VL == 0, it
+   // represents MAX_VL (we do not support zero-cycle vector operations).
+   reg [VL_WIDTH-1:0] VL;
+
+   // This implements the VSETVL logic: rd <- vl <- min(AVL, MAX_VL)
+   // Note: We produce one bit extra compared to the VL register in order to
+   // be able to represent MAX_VL in the VSETVL result.
+   wire [31:0] setvlOut;
+   assign setvlOut[31:VL_WIDTH+1] = 0;
+   assign setvlOut[VL_WIDTH:0] = rs1 < MAX_VL ? rs1[VL_WIDTH:0] : MAX_VL;
+
    /***************************************************************************/
    // The value written back to the register file.
    /***************************************************************************/
@@ -306,7 +333,8 @@ module FemtoRV32(
       (isALU               ? aluOut    : 32'b0) |  // ALUreg, ALUimm
       (isAUIPC             ? PCplusImm : 32'b0) |  // AUIPC
       (isJALR   | isJAL    ? PCplus4   : 32'b0) |  // JAL, JALR
-      (isLoad              ? LOAD_data : 32'b0) ;  // Load
+      (isLoad              ? LOAD_data : 32'b0) |  // Load
+      (isVSETVL            ? setvlOut  : 32'b0) ;  // VSETVL
 
    /* verilator lint_on WIDTH */
 
@@ -409,16 +437,15 @@ module FemtoRV32(
                          PCplus4;
 
    // Vector state.
-   reg [1:0] vecIdx;
-   reg [1:0] vecLen;
-   wire [1:0] vecIdx_new = vecIdx + 1;
-   wire vecOpDone = (vecIdx == vecLen) | !(src1IsVec | src2IsVec);
+   reg [VL_WIDTH-1:0] vecIdx;
+   wire [VL_WIDTH-1:0] vecIdx_new = vecIdx + 1;
+   wire vecElementsPending = isVectorOp & (vecIdx_new != VL);
 
    always @(posedge clk) begin
       if(!reset) begin
          state      <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
          PC         <= RESET_ADDR[ADDR_WIDTH-1:0];
-         vecLen     <= 2'b00;
+         VL         <= 0;
       end else
 
       // See note [1] at the end of this file.
@@ -430,13 +457,15 @@ module FemtoRV32(
               // Bits 0 and 1 of the instruction word indicate vector mode of the source operands.
               src1IsVec <= !mem_rdata[0];
               src2IsVec <= !mem_rdata[1];
+
+              // Latch source register contents.
               rs1 <= mem_rdata[0] ? registerFile[mem_rdata[19:15]] :
-                                    registerFile[{mem_rdata[17:15],2'b00}];
+                                    registerFile[{mem_rdata[19-VL_WIDTH:15], {VL_WIDTH{1'b0}}}];
               rs2 <= mem_rdata[1] ? registerFile[mem_rdata[24:20]]:
-                                    registerFile[{mem_rdata[22:20],2'b00}];
+                                    registerFile[{mem_rdata[24-VL_WIDTH:20], {VL_WIDTH{1'b0}}}];
 
               // Restart vector element counter.
-              vecIdx <= 2b'00;
+              vecIdx <= 0;
 
               // Latch instruction word.
               instr <= mem_rdata[31:2]; // (see declaration of instr).
@@ -445,20 +474,28 @@ module FemtoRV32(
         end
 
         state[EXECUTE_bit]: begin
-           if(vecOpDone) PC <= PC_new;
+           if (!vecElementsPending)
+              PC <= PC_new;
 
            // Iterate over the source vector registers.
-           rs1 <= registerFile[{instr[17:15],vecIdx_new}];
-           rs2 <= registerFile[{instr[22:20],vecIdx_new}];
+           // TODO(m): We really want to do this when going to EXECUTE, so we do not want to do it
+           // when going to WAIT_ALU_OR_MEM. Instead of having this logic in each state, can we do
+           // it in a single place (wires instead of registers?)?
+           if (src1IsVec)
+              rs1 <= registerFile[{instr[19-VL_WIDTH:15], vecIdx_new}];
+           if (src2IsVec)
+              rs2 <= registerFile[{instr[24-VL_WIDTH:20], vecIdx_new}];
            vecIdx <= vecIdx_new;
 
-           // TODO(m): Handle state transitions for vector operations!
-           state <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
+           if (needToWait)
+              state <= WAIT_ALU_OR_MEM;
+           else
+              state <= vecElementsPending ? EXECUTE : FETCH_INSTR;
         end
 
         state[WAIT_ALU_OR_MEM_bit]: begin
-           // TODO(m): Handle state transitions for vector operations!
-           if(!aluBusy & !mem_rbusy & !mem_wbusy) state <= FETCH_INSTR;
+           if(!aluBusy & !mem_rbusy & !mem_wbusy)
+              state <= vecElementsPending ? EXECUTE : FETCH_INSTR;
         end
 
         default: begin // FETCH_INSTR