-
Notifications
You must be signed in to change notification settings - Fork 639
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1088 from riscv/vector
- Loading branch information
Showing
23 changed files
with
6,698 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
[appendix] | ||
== Calling Convention for Vector State (Not authoritative - Placeholder Only) | ||
|
||
NOTE: This Appendix is only a placeholder to help explain the | ||
conventions used in the code examples, and is not considered frozen or | ||
part of the ratification process. The official RISC-V psABI document | ||
is being expanded to specify the vector calling conventions. | ||
|
||
In the RISC-V psABI, the vector registers `v0`-`v31` are all caller-saved. | ||
The `vl` and `vtype` CSRs are also caller-saved. | ||
|
||
Procedures may assume that `vstart` is zero upon entry. Procedures may | ||
assume that `vstart` is zero upon return from a procedure call. | ||
|
||
NOTE: Application software should normally not write `vstart` explicitly. | ||
Any procedure that does explicitly write `vstart` to a nonzero value must | ||
zero `vstart` before either returning or calling another procedure. | ||
|
||
The `vxrm` and `vxsat` fields of `vcsr` have thread storage duration. | ||
|
||
Executing a system call causes all caller-saved vector registers | ||
(`v0`-`v31`, `vl`, `vtype`) and `vstart` to become unspecified. | ||
|
||
NOTE: This scheme allows system calls that cause context switches to avoid | ||
saving and later restoring the vector registers. | ||
|
||
NOTE: Most OSes will choose to either leave these registers intact or reset | ||
them to their initial state to avoid leaking information across process | ||
boundaries. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
.text | ||
.balign 4 | ||
.global memcpy | ||
# void *memcpy(void* dest, const void* src, size_t n) | ||
# a0=dest, a1=src, a2=n | ||
# | ||
# Register use:
#   a0 = dest, never modified, so it is also the return value
#   a1 = source cursor
#   a2 = bytes still to copy
#   a3 = destination cursor
#   t0 = bytes handled in the current pass (vl from vsetvli)
#   v0 = data in flight (register group, LMUL=8)
memcpy:
    mv      a3, a0                      # a3 walks the destination; a0 is kept for the return
.Lmemcpy_chunk:
    vsetvli t0, a2, e8, m8, ta, ma      # t0 = vl = bytes this pass (SEW=8, LMUL=8)
    vle8.v  v0, (a1)                    # Fetch t0 bytes from the source
    vse8.v  v0, (a3)                    # Write the same bytes to the destination
    add     a1, a1, t0                  # Advance source cursor
    add     a3, a3, t0                  # Advance destination cursor
    sub     a2, a2, t0                  # remaining -= bytes copied
    bnez    a2, .Lmemcpy_chunk          # Keep going while bytes remain
    ret                                 # a0 still holds dest
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
.text | ||
.balign 4 | ||
.global saxpy | ||
# void | ||
# saxpy(size_t n, const float a, const float *x, float *y) | ||
# { | ||
# size_t i; | ||
# for (i=0; i<n; i++) | ||
# y[i] = a * x[i] + y[i]; | ||
# } | ||
# | ||
# register arguments: | ||
# a0 n | ||
# fa0 a | ||
# a1 x | ||
# a2 y | ||
|
||
# Register use:
#   a0  = elements still to process
#   fa0 = scalar a
#   a1  = cursor into x, a2 = cursor into y
#   a4  = vl for this pass, then reused as the byte step (vl * 4)
#   v0  = x strip, v8 = y strip / accumulator (register groups, LMUL=8)
saxpy:
.Lsaxpy_strip:
    vsetvli   a4, a0, e32, m8, ta, ma   # a4 = vl = elements this pass (SEW=32, LMUL=8)
    vle32.v   v0, (a1)                  # Load strip of x
    vle32.v   v8, (a2)                  # Load strip of y
    sub       a0, a0, a4                # remaining -= vl
    slli      a4, a4, 2                 # Elements -> bytes (4 bytes per float)
    vfmacc.vf v8, fa0, v0               # y strip += a * x strip
    add       a1, a1, a4                # Advance x cursor
    vse32.v   v8, (a2)                  # Store updated y strip
    add       a2, a2, a4                # Advance y cursor
    bnez      a0, .Lsaxpy_strip         # More elements left?
    ret
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,221 @@ | ||
.text | ||
.balign 4 | ||
.global sgemm_nn | ||
# RV64IDV system | ||
# | ||
# void | ||
# sgemm_nn(size_t n, | ||
# size_t m, | ||
# size_t k, | ||
# const float*a, // m * k matrix | ||
# size_t lda, | ||
# const float*b, // k * n matrix | ||
# size_t ldb, | ||
# float*c, // m * n matrix | ||
# size_t ldc) | ||
# | ||
# c += a*b (alpha=1, no transpose on input matrices) | ||
# matrices stored in C row-major order | ||
|
||
# Register aliases used by sgemm_nn.
#
# Arguments that arrive in a0-a7, in signature order
#define n        a0
#define m        a1
#define k        a2
#define ap       a3
#define astride  a4
#define bp       a5
#define bstride  a6
#define cp       a7

# Last argument (ldc); the routine loads it from the stack frame
#define cstride  t0

# Loop counters for the k and n dimensions
#define kt       t1
#define nt       t2

# Per-loop working pointers into B, C and A
#define bnp      t3
#define cnp      t4
#define akp      t5
#define bkp      s0

# Vector length returned by vsetvli
#define nvl      s1

# Row-walking pointers for the C block and the A column
#define ccp      s2
#define amp      s3

# Argument FP registers reused as extra scalar temporaries
#define ft12     fa0
#define ft13     fa1
#define ft14     fa2
#define ft15     fa3
|
||
# This version holds a 16*VLMAX block of the C matrix in vector registers
# during the inner loop, but otherwise performs no cache or TLB tiling.
|
||
# sgemm_nn: c += a*b for row-major single-precision matrices (RV64).
#
# Inputs:   a0-a7 carry n, m, k, a, lda, b, ldb, c; ldc is the first stack
#           argument. Strides are given in elements and converted to bytes here.
# Clobbers: a0-a7 are modified in place (m is counted down, a and c pointers
#           advance); t0-t6, ft0-ft11, fa0-fa3, v0-v16, vl and vtype.
# Saved:    s0-s3 are spilled in the prologue and restored at exit.
# Limits:   rows are processed in blocks of 16; any trailing m % 16 rows are
#           NOT processed because end_rows is unimplemented.
#
# Frame layout (size kept at a multiple of 16 so sp stays aligned):
#define SGEMM_FRAMESIZE 32
#define SGEMM_S0_OFF     0
#define SGEMM_S1_OFF     8
#define SGEMM_S2_OFF    16
#define SGEMM_S3_OFF    24
# ldc sat at 0(sp) on entry, so it is just above the new frame.
#define SGEMM_LDC_OFF   SGEMM_FRAMESIZE

sgemm_nn:
    addi sp, sp, -SGEMM_FRAMESIZE
    sd s0, SGEMM_S0_OFF(sp)
    sd s1, SGEMM_S1_OFF(sp)
    sd s2, SGEMM_S2_OFF(sp)
    sd s3, SGEMM_S3_OFF(sp)     # amp lives in s3, which is callee-saved

    # Check for zero size matrices
    beqz n, exit
    beqz m, exit
    beqz k, exit

    # Convert elements strides to byte strides.
    ld cstride, SGEMM_LDC_OFF(sp)   # Get ldc from the caller's outgoing-argument area
    slli astride, astride, 2
    slli bstride, bstride, 2
    slli cstride, cstride, 2

    sltiu t6, m, 16             # m is a size_t, so compare unsigned
    bnez t6, end_rows

c_row_loop: # Loop across rows of C blocks

    mv nt, n  # Initialize n counter for next row of C blocks

    mv bnp, bp # Initialize B n-loop pointer to start
    mv cnp, cp # Initialize C n-loop pointer

c_col_loop: # Loop across one row of C blocks
    vsetvli nvl, nt, e32, m1, ta, ma  # 32-bit vectors, LMUL=1

    mv akp, ap   # reset pointer into A to beginning
    mv bkp, bnp # step to next column in B matrix

    # Initialize current C submatrix block from memory.
    vle32.v  v0, (cnp); add ccp, cnp, cstride;
    vle32.v  v1, (ccp); add ccp, ccp, cstride;
    vle32.v  v2, (ccp); add ccp, ccp, cstride;
    vle32.v  v3, (ccp); add ccp, ccp, cstride;
    vle32.v  v4, (ccp); add ccp, ccp, cstride;
    vle32.v  v5, (ccp); add ccp, ccp, cstride;
    vle32.v  v6, (ccp); add ccp, ccp, cstride;
    vle32.v  v7, (ccp); add ccp, ccp, cstride;
    vle32.v  v8, (ccp); add ccp, ccp, cstride;
    vle32.v  v9, (ccp); add ccp, ccp, cstride;
    vle32.v v10, (ccp); add ccp, ccp, cstride;
    vle32.v v11, (ccp); add ccp, ccp, cstride;
    vle32.v v12, (ccp); add ccp, ccp, cstride;
    vle32.v v13, (ccp); add ccp, ccp, cstride;
    vle32.v v14, (ccp); add ccp, ccp, cstride;
    vle32.v v15, (ccp)


    mv kt, k # Initialize inner loop counter

    # Inner loop scheduled assuming 4-clock occupancy of vfmacc instruction and single-issue pipeline
    # Software pipeline loads
    flw ft0, (akp); add amp, akp, astride;
    flw ft1, (amp); add amp, amp, astride;
    flw ft2, (amp); add amp, amp, astride;
    flw ft3, (amp); add amp, amp, astride;
    # Get vector from B matrix
    vle32.v v16, (bkp)

    # Loop on inner dimension for current C block
 k_loop:
    vfmacc.vf v0, ft0, v16
    add bkp, bkp, bstride
    flw ft4, (amp)
    add amp, amp, astride
    vfmacc.vf v1, ft1, v16
    addi kt, kt, -1    # Decrement k counter
    flw ft5, (amp)
    add amp, amp, astride
    vfmacc.vf v2, ft2, v16
    flw ft6, (amp)
    add amp, amp, astride
    flw ft7, (amp)
    vfmacc.vf v3, ft3, v16
    add amp, amp, astride
    flw ft8, (amp)
    add amp, amp, astride
    vfmacc.vf v4, ft4, v16
    flw ft9, (amp)
    add amp, amp, astride
    vfmacc.vf v5, ft5, v16
    flw ft10, (amp)
    add amp, amp, astride
    vfmacc.vf v6, ft6, v16
    flw ft11, (amp)
    add amp, amp, astride
    vfmacc.vf v7, ft7, v16
    flw ft12, (amp)
    add amp, amp, astride
    vfmacc.vf v8, ft8, v16
    flw ft13, (amp)
    add amp, amp, astride
    vfmacc.vf v9, ft9, v16
    flw ft14, (amp)
    add amp, amp, astride
    vfmacc.vf v10, ft10, v16
    flw ft15, (amp)
    add amp, amp, astride
    addi akp, akp, 4            # Move to next column of a
    vfmacc.vf v11, ft11, v16
    beqz kt, 1f                 # Don't load past end of matrix
    flw ft0, (akp)
    add amp, akp, astride
1:  vfmacc.vf v12, ft12, v16
    beqz kt, 1f
    flw ft1, (amp)
    add amp, amp, astride
1:  vfmacc.vf v13, ft13, v16
    beqz kt, 1f
    flw ft2, (amp)
    add amp, amp, astride
1:  vfmacc.vf v14, ft14, v16
    beqz kt, 1f                 # Exit out of loop
    flw ft3, (amp)
    add amp, amp, astride
    vfmacc.vf v15, ft15, v16
    vle32.v v16, (bkp)            # Get next vector from B matrix, overlap loads with jump stalls
    j k_loop

1:  vfmacc.vf v15, ft15, v16

    # Save C matrix block back to memory
    vse32.v  v0, (cnp); add ccp, cnp, cstride;
    vse32.v  v1, (ccp); add ccp, ccp, cstride;
    vse32.v  v2, (ccp); add ccp, ccp, cstride;
    vse32.v  v3, (ccp); add ccp, ccp, cstride;
    vse32.v  v4, (ccp); add ccp, ccp, cstride;
    vse32.v  v5, (ccp); add ccp, ccp, cstride;
    vse32.v  v6, (ccp); add ccp, ccp, cstride;
    vse32.v  v7, (ccp); add ccp, ccp, cstride;
    vse32.v  v8, (ccp); add ccp, ccp, cstride;
    vse32.v  v9, (ccp); add ccp, ccp, cstride;
    vse32.v v10, (ccp); add ccp, ccp, cstride;
    vse32.v v11, (ccp); add ccp, ccp, cstride;
    vse32.v v12, (ccp); add ccp, ccp, cstride;
    vse32.v v13, (ccp); add ccp, ccp, cstride;
    vse32.v v14, (ccp); add ccp, ccp, cstride;
    vse32.v v15, (ccp)

    # Following tail instructions should be scheduled earlier in free slots during C block save.
    # Leaving here for clarity.

    # Bump pointers for loop across blocks in one row
    slli t6, nvl, 2
    add cnp, cnp, t6                         # Move C block pointer over
    add bnp, bnp, t6                         # Move B block pointer over
    sub nt, nt, nvl                          # Decrement element count in n dimension
    bnez nt, c_col_loop                      # Any more to do?

    # Move to next set of rows
    addi m, m, -16  # Did 16 rows above
    slli t6, astride, 4  # Multiply astride by 16
    add ap, ap, t6         # Move A matrix pointer down 16 rows
    slli t6, cstride, 4  # Multiply cstride by 16
    add cp, cp, t6         # Move C matrix pointer down 16 rows

    sltiu t6, m, 16         # Fewer than 16 rows left? (unsigned, m never wraps here)
    beqz t6, c_row_loop

    # Handle end of matrix with fewer than 16 rows.
    # Can use smaller versions of above decreasing in powers-of-2 depending on code-size concerns.
end_rows:
    # Not done.

exit:
    ld s0, SGEMM_S0_OFF(sp)
    ld s1, SGEMM_S1_OFF(sp)
    ld s2, SGEMM_S2_OFF(sp)
    ld s3, SGEMM_S3_OFF(sp)
    addi sp, sp, SGEMM_FRAMESIZE
    ret
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
.text | ||
.balign 4 | ||
.global strcmp | ||
# int strcmp(const char *src1, const char* src2) | ||
# Register use:
#   a0, a1 = cursors into src1 and src2; a0 also carries the result
#   t1     = bytes consumed by the previous pass (0 on the first pass)
#   a2     = index of the first terminating/differing byte, negative if none
#   a3, a4 = the two bytes at that index
#   v8, v16 = data strips (LMUL=2 groups); v0, v1 = mask scratch
strcmp:
    mv        t1, zero                  # Nothing consumed before the first pass
.Lstrcmp_pass:
    vsetvli   t0, x0, e8, m2, ta, ma    # Request the maximum vector length, byte elements
    add       a0, a0, t1                # Step src1 past the bytes already compared
    add       a1, a1, t1                # Step src2 by the same amount
    vle8ff.v  v8, (a0)                  # Fault-only-first load from src1
    vle8ff.v  v16, (a1)                 # Fault-only-first load from src2

    vmsne.vv  v1, v8, v16               # v1[i] = (src1[i] != src2[i])
    vmseq.vi  v0, v8, 0                 # v0[i] = (src1[i] == 0)
    vmor.mm   v0, v0, v1                # Stop on either condition

    csrr      t1, vl                    # Bytes actually covered by this pass
    vfirst.m  a2, v0                    # First stop position, or negative if none

    bltz      a2, .Lstrcmp_pass         # No terminator and no mismatch yet

    add       a0, a0, a2                # Address of the deciding byte in src1
    add       a1, a1, a2                # Address of the deciding byte in src2
    lbu       a3, 0(a0)                 # Reload both bytes as unsigned scalars
    lbu       a4, 0(a1)

    sub       a0, a3, a4                # Result = src1 byte - src2 byte

    ret
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
.text | ||
.balign 4 | ||
.global strcpy | ||
# char* strcpy(char *dst, const char* src) | ||
# Register use:
#   a0 = dst, never modified, so it is also the return value
#   a1 = source cursor, a2 = destination cursor
#   t0 = all-ones AVL, t1 = bytes fetched by the current pass
#   a3 = index of the NUL byte in this pass, negative if none
#   v8 = data strip (LMUL=8 group); v1 = NUL flags; v0 = store mask
strcpy:
    li        t0, -1                    # All-ones application vector length
    mv        a2, a0                    # a2 walks the destination; a0 is kept for the return
.Lstrcpy_pass:
    vsetvli   x0, t0, e8, m8, ta, ma    # Byte elements, LMUL=8, as many as the AVL allows
    vle8ff.v  v8, (a1)                  # Fault-only-first load of source bytes
    vmseq.vi  v1, v8, 0                 # v1[i] = (byte i is NUL)
    vmsif.m   v0, v1                    # v0 = everything up to and including the first NUL
    vse8.v    v8, (a2), v0.t            # Store only the bytes selected by v0
    vfirst.m  a3, v1                    # Index of the NUL, or negative if none
    csrr      t1, vl                    # Bytes fetched by this pass
    add       a1, a1, t1                # Advance source cursor
    add       a2, a2, t1                # Advance destination cursor
    bltz      a3, .Lstrcpy_pass         # No NUL seen yet: copy another strip

    ret                                 # a0 still holds dst
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
.text | ||
.balign 4 | ||
.global strlen | ||
# size_t strlen(const char *str) | ||
# a0 holds *str | ||
|
||
# Register use:
#   a0 = start of the string on entry, length on return
#   a3 = scan cursor
#   a1 = bytes read by the current pass (vl)
#   a2 = index of the NUL byte in this pass, negative if none
#   v8 = data strip (LMUL=8 group); v0 = NUL flags
strlen:
    mv        a3, a0                    # Scan cursor starts at the string
.Lstrlen_pass:
    vsetvli   a1, x0, e8, m8, ta, ma    # Request the maximum vector length, byte elements
    vle8ff.v  v8, (a3)                  # Fault-only-first load
    vmseq.vi  v0, v8, 0                 # v0[i] = (byte i is NUL)
    csrr      a1, vl                    # Bytes actually read by this pass
    vfirst.m  a2, v0                    # Index of the NUL, or negative if none
    add       a3, a3, a1                # Cursor moves past this strip
    bltz      a2, .Lstrlen_pass         # No NUL in this strip: keep scanning

    # The cursor already includes the last strip, so cancel that bump by
    # adding it to the start address before taking the difference.
    add       a3, a3, a2                # Cursor + index of NUL within the last strip
    add       a0, a0, a1                # Start + size of the last strip
    sub       a0, a3, a0                # Length = NUL address - start address

    ret
Oops, something went wrong.