Merge pull request #1088 from riscv/vector
wmat authored Mar 20, 2024
2 parents a4382e9 + 7013a90 commit aa5dce0
Showing 23 changed files with 6,698 additions and 5 deletions.
3 changes: 1 addition & 2 deletions src/c-st-ext.adoc
@@ -306,8 +306,7 @@ These instructions use the CI format.
C.LWSP loads a 32-bit value from memory into register _rd_. It computes
an effective address by adding the _zero_-extended offset, scaled by 4,
to the stack pointer, `x2`. It expands to `lw rd, offset(x2)`. C.LWSP is
-only valid when _rd_≠x0 the code
-points with _rd_=x0 are reserved.
+only valid when _rd_≠x0; the code points with _rd_=x0 are reserved.

C.LDSP is an RV64C/RV128C-only instruction that loads a 64-bit value
from memory into register _rd_. It computes its effective address by
29 changes: 29 additions & 0 deletions src/calling-convention.adoc
@@ -0,0 +1,29 @@
[appendix]
== Calling Convention for Vector State (Not authoritative - Placeholder Only)

NOTE: This Appendix is only a placeholder to help explain the
conventions used in the code examples, and is not considered frozen or
part of the ratification process. The official RISC-V psABI document
is being expanded to specify the vector calling conventions.

In the RISC-V psABI, the vector registers `v0`-`v31` are all caller-saved.
The `vl` and `vtype` CSRs are also caller-saved.

Procedures may assume that `vstart` is zero upon entry. Procedures may
assume that `vstart` is zero upon return from a procedure call.

NOTE: Application software should normally not write `vstart` explicitly.
Any procedure that does explicitly write `vstart` to a nonzero value must
zero `vstart` before either returning or calling another procedure.
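
For illustration only, a hypothetical routine that explicitly writes
`vstart`, for example to resume a partially completed vector load, might
maintain this invariant as follows (a sketch, assuming `vtype` and `vl`
have already been configured by the caller):

----
resume_load:                 # Hypothetical: a0 = resume element, a1 = base address
    csrw    vstart, a0       # Explicitly write a possibly nonzero vstart
    vle32.v v8, (a1)         # Resumes at element vstart; resets vstart to 0 on completion
    csrwi   vstart, 0        # Defensive: ensure vstart is zero on every return path
    ret
----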

The `vxrm` and `vxsat` fields of `vcsr` have thread storage duration.

Executing a system call causes all caller-saved vector registers
(`v0`-`v31`, `vl`, `vtype`) and `vstart` to become unspecified.

NOTE: This scheme allows system calls that cause context switches to avoid
saving and later restoring the vector registers.

NOTE: Most OSes will choose to either leave these registers intact or reset
them to their initial state to avoid leaking information across process
boundaries.
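
As a sketch of the caller-saved convention (the callee `foo` and the
register choices are hypothetical, and the fragment assumes the enclosing
prologue has already saved `ra`), a caller that needs `v8`, `vl`, and
`vtype` to survive a call must spill and restore them itself:

----
    csrr    t0, vlenb        # Bytes in one vector register (VLEN/8)
    sub     sp, sp, t0
    vs1r.v  v8, (sp)         # Spill v8 with a whole-register store
    csrr    t0, vl           # vl and vtype are caller-saved CSRs
    csrr    t1, vtype
    addi    sp, sp, -16
    sd      t0, 0(sp)
    sd      t1, 8(sp)
    call    foo              # May clobber all of v0-v31, vl, and vtype
    ld      t0, 0(sp)
    ld      t1, 8(sp)
    addi    sp, sp, 16
    vsetvl  x0, t0, t1       # Restore vtype and re-establish the saved vl
    vl1r.v  v8, (sp)         # Reload v8
    csrr    t0, vlenb
    add     sp, sp, t0
----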
17 changes: 17 additions & 0 deletions src/example/memcpy.s
@@ -0,0 +1,17 @@
.text
.balign 4
.global memcpy
# void *memcpy(void* dest, const void* src, size_t n)
# a0=dest, a1=src, a2=n
#
memcpy:
mv a3, a0 # Copy destination
loop:
vsetvli t0, a2, e8, m8, ta, ma # Vectors of 8b
vle8.v v0, (a1) # Load bytes
add a1, a1, t0 # Bump pointer
sub a2, a2, t0 # Decrement count
vse8.v v0, (a3) # Store bytes
add a3, a3, t0 # Bump pointer
bnez a2, loop # Any more?
ret # Return
29 changes: 29 additions & 0 deletions src/example/saxpy.s
@@ -0,0 +1,29 @@
.text
.balign 4
.global saxpy
# void
# saxpy(size_t n, const float a, const float *x, float *y)
# {
# size_t i;
# for (i=0; i<n; i++)
# y[i] = a * x[i] + y[i];
# }
#
# register arguments:
# a0 n
# fa0 a
# a1 x
# a2 y

saxpy:
vsetvli a4, a0, e32, m8, ta, ma
vle32.v v0, (a1)
sub a0, a0, a4
slli a4, a4, 2
add a1, a1, a4
vle32.v v8, (a2)
vfmacc.vf v8, fa0, v0
vse32.v v8, (a2)
add a2, a2, a4
bnez a0, saxpy
ret
221 changes: 221 additions & 0 deletions src/example/sgemm.S
@@ -0,0 +1,221 @@
.text
.balign 4
.global sgemm_nn
# RV64IDV system
#
# void
# sgemm_nn(size_t n,
# size_t m,
# size_t k,
# const float*a, // m * k matrix
# size_t lda,
# const float*b, // k * n matrix
# size_t ldb,
# float*c, // m * n matrix
# size_t ldc)
#
# c += a*b (alpha=1, no transpose on input matrices)
# matrices stored in C row-major order

#define n a0
#define m a1
#define k a2
#define ap a3
#define astride a4
#define bp a5
#define bstride a6
#define cp a7
#define cstride t0
#define kt t1
#define nt t2
#define bnp t3
#define cnp t4
#define akp t5
#define bkp s0
#define nvl s1
#define ccp s2
#define amp s3

# Use args as additional temporaries
#define ft12 fa0
#define ft13 fa1
#define ft14 fa2
#define ft15 fa3

# This version holds a 16*VLMAX block of C matrix in vector registers
# in inner loop, but otherwise does not cache or TLB tiling.

#define FRAMESIZE 32

sgemm_nn:
addi sp, sp, -FRAMESIZE # 16B-aligned frame for three callee-saved registers
sd s0, 0(sp)
sd s1, 8(sp)
sd s2, 16(sp)

# Check for zero size matrices
beqz n, exit
beqz m, exit
beqz k, exit

# Convert elements strides to byte strides.
ld cstride, FRAMESIZE(sp) # Get ldc (9th arg) from caller's stack frame
slli astride, astride, 2
slli bstride, bstride, 2
slli cstride, cstride, 2

slti t6, m, 16
bnez t6, end_rows

c_row_loop: # Loop across rows of C blocks

mv nt, n # Initialize n counter for next row of C blocks

mv bnp, bp # Initialize B n-loop pointer to start
mv cnp, cp # Initialize C n-loop pointer

c_col_loop: # Loop across one row of C blocks
vsetvli nvl, nt, e32, m1, ta, ma # 32-bit vectors, LMUL=1

mv akp, ap # reset pointer into A to beginning
mv bkp, bnp # step to next column in B matrix

# Initialize current C submatrix block from memory.
vle32.v v0, (cnp); add ccp, cnp, cstride;
vle32.v v1, (ccp); add ccp, ccp, cstride;
vle32.v v2, (ccp); add ccp, ccp, cstride;
vle32.v v3, (ccp); add ccp, ccp, cstride;
vle32.v v4, (ccp); add ccp, ccp, cstride;
vle32.v v5, (ccp); add ccp, ccp, cstride;
vle32.v v6, (ccp); add ccp, ccp, cstride;
vle32.v v7, (ccp); add ccp, ccp, cstride;
vle32.v v8, (ccp); add ccp, ccp, cstride;
vle32.v v9, (ccp); add ccp, ccp, cstride;
vle32.v v10, (ccp); add ccp, ccp, cstride;
vle32.v v11, (ccp); add ccp, ccp, cstride;
vle32.v v12, (ccp); add ccp, ccp, cstride;
vle32.v v13, (ccp); add ccp, ccp, cstride;
vle32.v v14, (ccp); add ccp, ccp, cstride;
vle32.v v15, (ccp)


mv kt, k # Initialize inner loop counter

# Inner loop scheduled assuming 4-clock occupancy of vfmacc instruction and single-issue pipeline
# Software pipeline loads
flw ft0, (akp); add amp, akp, astride;
flw ft1, (amp); add amp, amp, astride;
flw ft2, (amp); add amp, amp, astride;
flw ft3, (amp); add amp, amp, astride;
# Get vector from B matrix
vle32.v v16, (bkp)

# Loop on inner dimension for current C block
k_loop:
vfmacc.vf v0, ft0, v16
add bkp, bkp, bstride
flw ft4, (amp)
add amp, amp, astride
vfmacc.vf v1, ft1, v16
addi kt, kt, -1 # Decrement k counter
flw ft5, (amp)
add amp, amp, astride
vfmacc.vf v2, ft2, v16
flw ft6, (amp)
add amp, amp, astride
flw ft7, (amp)
vfmacc.vf v3, ft3, v16
add amp, amp, astride
flw ft8, (amp)
add amp, amp, astride
vfmacc.vf v4, ft4, v16
flw ft9, (amp)
add amp, amp, astride
vfmacc.vf v5, ft5, v16
flw ft10, (amp)
add amp, amp, astride
vfmacc.vf v6, ft6, v16
flw ft11, (amp)
add amp, amp, astride
vfmacc.vf v7, ft7, v16
flw ft12, (amp)
add amp, amp, astride
vfmacc.vf v8, ft8, v16
flw ft13, (amp)
add amp, amp, astride
vfmacc.vf v9, ft9, v16
flw ft14, (amp)
add amp, amp, astride
vfmacc.vf v10, ft10, v16
flw ft15, (amp)
add amp, amp, astride
addi akp, akp, 4 # Move to next column of a
vfmacc.vf v11, ft11, v16
beqz kt, 1f # Don't load past end of matrix
flw ft0, (akp)
add amp, akp, astride
1: vfmacc.vf v12, ft12, v16
beqz kt, 1f
flw ft1, (amp)
add amp, amp, astride
1: vfmacc.vf v13, ft13, v16
beqz kt, 1f
flw ft2, (amp)
add amp, amp, astride
1: vfmacc.vf v14, ft14, v16
beqz kt, 1f # Exit out of loop
flw ft3, (amp)
add amp, amp, astride
vfmacc.vf v15, ft15, v16
vle32.v v16, (bkp) # Get next vector from B matrix, overlap loads with jump stalls
j k_loop

1: vfmacc.vf v15, ft15, v16

# Save C matrix block back to memory
vse32.v v0, (cnp); add ccp, cnp, cstride;
vse32.v v1, (ccp); add ccp, ccp, cstride;
vse32.v v2, (ccp); add ccp, ccp, cstride;
vse32.v v3, (ccp); add ccp, ccp, cstride;
vse32.v v4, (ccp); add ccp, ccp, cstride;
vse32.v v5, (ccp); add ccp, ccp, cstride;
vse32.v v6, (ccp); add ccp, ccp, cstride;
vse32.v v7, (ccp); add ccp, ccp, cstride;
vse32.v v8, (ccp); add ccp, ccp, cstride;
vse32.v v9, (ccp); add ccp, ccp, cstride;
vse32.v v10, (ccp); add ccp, ccp, cstride;
vse32.v v11, (ccp); add ccp, ccp, cstride;
vse32.v v12, (ccp); add ccp, ccp, cstride;
vse32.v v13, (ccp); add ccp, ccp, cstride;
vse32.v v14, (ccp); add ccp, ccp, cstride;
vse32.v v15, (ccp)

# Following tail instructions should be scheduled earlier in free slots during C block save.
# Leaving here for clarity.

# Bump pointers for loop across blocks in one row
slli t6, nvl, 2
add cnp, cnp, t6 # Move C block pointer over
add bnp, bnp, t6 # Move B block pointer over
sub nt, nt, nvl # Decrement element count in n dimension
bnez nt, c_col_loop # Any more to do?

# Move to next set of rows
addi m, m, -16 # Did 16 rows above
slli t6, astride, 4 # Multiply astride by 16
add ap, ap, t6 # Move A matrix pointer down 16 rows
slli t6, cstride, 4 # Multiply cstride by 16
add cp, cp, t6 # Move C matrix pointer down 16 rows

slti t6, m, 16
beqz t6, c_row_loop

# Handle end of matrix with fewer than 16 rows.
# Can use smaller versions of above decreasing in powers-of-2 depending on code-size concerns.
end_rows:
# Not done.

exit:
ld s0, 0(sp)
ld s1, 8(sp)
ld s2, 16(sp)
addi sp, sp, FRAMESIZE
ret
34 changes: 34 additions & 0 deletions src/example/strcmp.s
@@ -0,0 +1,34 @@
.text
.balign 4
.global strcmp
# int strcmp(const char *src1, const char* src2)
strcmp:
## Using LMUL=2, but same register names work for larger LMULs
li t1, 0 # Initial pointer bump
loop:
vsetvli t0, x0, e8, m2, ta, ma # Max length vectors of bytes
add a0, a0, t1 # Bump src1 pointer
vle8ff.v v8, (a0) # Get src1 bytes
add a1, a1, t1 # Bump src2 pointer
vle8ff.v v16, (a1) # Get src2 bytes

vmseq.vi v0, v8, 0 # Flag zero bytes in src1
vmsne.vv v1, v8, v16 # Flag if src1 != src2
vmor.mm v0, v0, v1 # Combine exit conditions

vfirst.m a2, v0 # Index of first zero or mismatch, else -1
csrr t1, vl # Get number of bytes fetched

bltz a2, loop # Loop if all same and no zero byte

add a0, a0, a2 # Get src1 element address
lbu a3, (a0) # Get src1 byte from memory

add a1, a1, a2 # Get src2 element address
lbu a4, (a1) # Get src2 byte from memory

sub a0, a3, a4 # Return value.

ret


20 changes: 20 additions & 0 deletions src/example/strcpy.s
@@ -0,0 +1,20 @@
.text
.balign 4
.global strcpy
# char* strcpy(char *dst, const char* src)
strcpy:
mv a2, a0 # Copy dst
li t0, -1 # Infinite AVL
loop:
vsetvli x0, t0, e8, m8, ta, ma # Max length vectors of bytes
vle8ff.v v8, (a1) # Get src bytes
csrr t1, vl # Get number of bytes fetched
vmseq.vi v1, v8, 0 # Flag zero bytes
vfirst.m a3, v1 # Zero found?
add a1, a1, t1 # Bump pointer
vmsif.m v0, v1 # Set mask up to and including zero byte.
vse8.v v8, (a2), v0.t # Write out bytes
add a2, a2, t1 # Bump pointer
bltz a3, loop # Zero byte not found, so loop

ret
22 changes: 22 additions & 0 deletions src/example/strlen.s
@@ -0,0 +1,22 @@
.text
.balign 4
.global strlen
# size_t strlen(const char *str)
# a0 holds *str

strlen:
mv a3, a0 # Save start
loop:
vsetvli a1, x0, e8, m8, ta, ma # Vector of bytes of maximum length
vle8ff.v v8, (a3) # Load bytes
csrr a1, vl # Get bytes read
vmseq.vi v0, v8, 0 # Set v0[i] where v8[i] = 0
vfirst.m a2, v0 # Find first set bit
add a3, a3, a1 # Bump pointer
bltz a2, loop # Not found?

add a0, a0, a1 # Sum start + bump
add a3, a3, a2 # Add index
sub a0, a3, a0 # Subtract start address+bump

ret