Commit 007cb530 authored by YouGuoliang's avatar YouGuoliang
Browse files

Add Lab03

parent 6519d3e6
...@@ -4,8 +4,8 @@ ...@@ -4,8 +4,8 @@
本学期计划实验时长为10周 本学期计划实验时长为10周
* Lab1(第4-5周) 【15%】: 熟悉RISC-V指令集,完成RV32I指令集流水线CPU的设计报告; * Lab1(第4-5周) 【15%】: 熟悉RISC-V指令集,完成RV32I指令集流水线CPU的设计报告;
* Lab2(第6-8周) 【40%】: 完成RV32I流水线CPU的Verilog代码;利用RISCV-test测试文件进行仿真和CPU功能验证 * Lab2(第6-9周) 【40%】: 完成RV32I流水线CPU的Verilog代码;利用RISCV-test测试文件进行仿真和CPU功能验证
* Lab3(第9-11周) 【20%】: cache设计和实现 * Lab3(第10-11周) 【20%】: cache设计和实现
* Lab4(第12-13周)【15%】: 分支预测设计与实现 * Lab4(第12-13周)【15%】: 分支预测设计与实现
* Lab5(第14-15周) 【10%】: 学习使用提供的Tomasulo软件模拟器和多Cache一致性软件模拟器,并完成实验报告 * Lab5(第14-15周) 【10%】: 学习使用提供的Tomasulo软件模拟器和多Cache一致性软件模拟器,并完成实验报告
...@@ -51,7 +51,19 @@ ...@@ -51,7 +51,19 @@
提交至BB平台 提交至BB平台
提交格式:Lab2-学号-姓名.rar(or .zip) 要求包括一份pdf格式实验报告和用到的源代码集合的文件夹 提交格式:Lab2-学号-姓名.rar(or .zip) 要求包括一份pdf格式实验报告和用到的源代码集合的文件夹
* **2021.5.9 Release Lab3**
阶段一、二课堂验收 截止日期:2021.5.24(只进行一次统一验收,验收时间为5月24日)
实验报告 截止日期:2021.5.31
提交至BB平台
提交格式:Lab3-学号-姓名.rar(or .zip) 要求包括一份pdf格式实验报告和用到的源代码集合的文件夹
## 实验课安排 ## 实验课安排
* **lab1答案分析+Lab2预先讲解** * lab1答案分析+Lab2预先讲解
2021.4.19晚(18:30-21:00 电三楼516) 2021.4.19晚(18:30-21:00 电三楼406)
* lab2阶段一检查
2021.4.26晚(18:30-21:00 电三楼406)
* lab2阶段二三检查(未检查的同学在本次阶段检查中完成Lab2所有阶段检查)
2021.5.10晚(18:30-21:00 电三楼406)
\ No newline at end of file
# -*- coding:utf-8 -*-
# Python2 or Python3
# Author : WangXuan
#
# Purpose: generate a mem.sv for the pseudo matrix multiplication (matmul)
#          benchmark. The memory image holds, in word order:
#            [0,       N*N)   destination matrix C (initialised to zero)
#            [N*N,   2*N*N)   source matrix A (random 32-bit words)
#            [2*N*N, 3*N*N)   source matrix B (random 32-bit words)
#          The expected value of every C element is written as a trailing
#          comment so the simulation result can be checked by eye.
#
# The Verilog text goes to stdout (so it can be redirected into mem.sv);
# usage, errors and warnings go to stderr so they never end up in the file.
#
import sys
from random import randint

verilog_head = '''
module mem #( //
parameter ADDR_LEN = 11 //
) (
input clk, rst,
input [ADDR_LEN-1:0] addr, // memory address
output reg [31:0] rd_data, // data read out
input wr_req,
input [31:0] wr_data // data write in
);
localparam MEM_SIZE = 1<<ADDR_LEN;
reg [31:0] ram_cell [MEM_SIZE];
always @ (posedge clk or posedge rst)
if(rst)
rd_data <= 0;
else
rd_data <= ram_cell[addr];
always @ (posedge clk)
if(wr_req)
ram_cell[addr] <= wr_data;
initial begin'''
verilog_tail = '''end
endmodule
'''

# Number of words available with the default ADDR_LEN = 11 of the module above.
DEFAULT_MEM_WORDS = 1 << 11


def random_matrices(n):
    """Return two n*n matrices (A, B) filled with random 32-bit unsigned words."""
    a, b = [], []
    for _ in range(n):
        a_row, b_row = [], []
        for _ in range(n):
            a_row.append(randint(0, 0xffffffff))
            b_row.append(randint(0, 0xffffffff))
        a.append(a_row)
        b.append(b_row)
    return a, b


def pseudo_matmul(a, b):
    """Return C where C[i][j] = sum over k of (A[i][k] & B[k][j]).

    Bitwise AND stands in for multiplication; the sum is NOT truncated here,
    callers mask it to 32 bits when formatting.
    """
    n = len(a)
    return [[sum(a[i][k] & b[k][j] for k in range(n)) for j in range(n)]
            for i in range(n)]


def memory_init_lines(a, b, c):
    """Return the `initial` block statements for matrices C, A and B (in that order)."""
    n = len(a)
    lines = [' // dst matrix C']
    for i in range(n):
        for j in range(n):
            # C itself is stored as zero; the expected result is only a comment.
            lines.append(" ram_cell[%8d] = 32'h0; // 32'h%08x;" % (n*i+j, c[i][j] & 0xffffffff))
    lines.append(' // src matrix A')
    for i in range(n):
        for j in range(n):
            lines.append(" ram_cell[%8d] = 32'h%08x;" % (n*n+n*i+j, a[i][j]))
    lines.append(' // src matrix B')
    for i in range(n):
        for j in range(n):
            lines.append(" ram_cell[%8d] = 32'h%08x;" % (2*n*n+n*i+j, b[i][j]))
    return lines


def main(argv):
    """Command-line entry point; returns the process exit status."""
    if len(argv) != 2:
        sys.stderr.write(' Usage:\n python generate_mem_for_matmul.py [matrix size]\n')
        sys.stderr.write(' Example:\n python generate_mem_for_matmul.py 16\n')
        sys.stderr.write(' Tip: use this command to write to file:\n python generate_mem_for_matmul.py 16 > mem.sv\n')
        return 0
    try:
        N = int(argv[1])
    except ValueError:
        sys.stderr.write(' *** Error: parameter must be integer, not %s\n' % (argv[1], ))
        return -1
    if N <= 1:
        sys.stderr.write(' *** Error: parameter must be larger than 1, not %d\n' % (N, ))
        return -1
    if 3*N*N > DEFAULT_MEM_WORDS:
        # Not fatal: ADDR_LEN is a module parameter and may be overridden.
        sys.stderr.write(' *** Warning: 3 matrices of size %d need %d words, more than the default '
                         'ADDR_LEN=11 memory (%d words); increase ADDR_LEN when instantiating mem\n'
                         % (N, 3*N*N, DEFAULT_MEM_WORDS))

    A, B = random_matrices(N)
    C = pseudo_matmul(A, B)

    print(verilog_head)
    for line in memory_init_lines(A, B, C):
        print(line)
    print(verilog_tail)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
# -*- coding:utf-8 -*-
# Python2 or Python3
# Author : WangXuan
#
# Purpose: generate a mem.sv for the quicksort benchmark. The memory image
#          holds the values 0..N-1 in random order in words 0..N-1, ready to
#          be sorted in place.
#
# The Verilog text goes to stdout (so it can be redirected into mem.sv);
# usage and errors go to stderr so they never end up in the file.
#
import sys
from random import shuffle

verilog_head = '''
module mem #( //
parameter ADDR_LEN = 11 //
) (
input clk, rst,
input [ADDR_LEN-1:0] addr, // memory address
output reg [31:0] rd_data, // data read out
input wr_req,
input [31:0] wr_data // data write in
);
localparam MEM_SIZE = 1<<ADDR_LEN;
reg [31:0] ram_cell [MEM_SIZE];
always @ (posedge clk or posedge rst)
if(rst)
rd_data <= 0;
else
rd_data <= ram_cell[addr];
always @ (posedge clk)
if(wr_req)
ram_cell[addr] <= wr_data;
initial begin'''
verilog_tail = '''end
endmodule
'''


def memory_init_lines(values):
    """Return one `ram_cell[i] = 32'h...;` statement per element of values."""
    return [" ram_cell[%8d] = 32'h%08x;" % (index, value, )
            for index, value in enumerate(values)]


def main(argv):
    """Command-line entry point; returns the process exit status."""
    if len(argv) != 2:
        # The argument is the number of elements to sort, not a matrix size.
        sys.stderr.write(' Usage:\n python generate_mem_for_quicksort.py [array size]\n')
        sys.stderr.write(' Example:\n python generate_mem_for_quicksort.py 16\n')
        sys.stderr.write(' Tip: use this command to write to file:\n python generate_mem_for_quicksort.py 16 > mem.sv\n')
        return 0
    try:
        N = int(argv[1])
    except ValueError:
        sys.stderr.write(' *** Error: parameter must be integer, not %s\n' % (argv[1], ))
        return -1
    if N <= 2:
        sys.stderr.write(' *** Error: parameter must be larger than 2, not %d\n' % (N, ))
        return -1

    lst = list(range(N))
    shuffle(lst)

    print(verilog_head)
    for line in memory_init_lines(lst):
        print(line)
    print(verilog_tail)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
# Pseudo matrix multiplication (RV32I assembly)
# The RV32I CPU has no multiply instruction, so this benchmark replaces the
# multiplication with a bitwise AND and keeps ordinary addition for the
# accumulation:  c[i][j] = sum over k of ( a[i][k] & b[k][j] ).
# It is not a real matrix product, but it accesses RAM in the same pattern as
# one, which is what matters when studying cache performance.
#
.org 0x0
.global _start
_start:
xori a4, zero, 4 # a4 = log2(N); the matrices are N x N with N = 2^a4 (a4 = 4 -> 16 x 16). May be changed, but the data RAM contents must then be regenerated for the same N.
# The next instructions compute the start addresses of the three matrices
# (destination C, source A, source B). They are stored back to back; with
# a4 = 4, N = 16, each matrix is N*N = 256 words = 1024 bytes, so:
#   C starts at byte 0, A starts at byte 1024, B starts at byte 2048.
# a2 = base of C, a0 = base of A, a1 = base of B.
xori a3, zero, 4
sll a3, a3 , a4 # a3 = 4*N = bytes per row; also the upper bound for the byte-scaled indices below
xor a2, zero, zero # a2 = 0 (base of C)
sll a0, a3 , a4 # a0 = 4*N*N (base of A)
add a1, a0 , a0 # a1 = 8*N*N (base of B)
# Triple loop, from outermost to innermost: i, j, k, held in t0, t1, t2.
# All three indices are kept pre-multiplied by 4 (byte offsets) and step by 4.
xor t0, zero, zero # i = 0
MatMulLoopI:
xor t1, zero, zero # j = 0
MatMulLoopJ:
xor t3, zero, zero # t3 = accumulator for c[i][j], cleared before the inner loop
xor t2, zero, zero # k = 0
MatMulLoopK:
sll t4, t0, a4 # t4 = (4*i) << a4 = byte offset of row i
add t4, t4, t2 #    + 4*k
add t4, t4, a0 #    + base of A
lw t4, (t4) # t4 = a[i][k]
sll t5, t2, a4 # t5 = (4*k) << a4 = byte offset of row k
add t5, t5, t1 #    + 4*j
add t5, t5, a1 #    + base of B
lw t5, (t5) # t5 = b[k][j]
and t4, t4, t5 # "product" = a[i][k] & b[k][j]
add t3, t3, t4 # accumulate
addi t2, t2, 4 # k++
blt t2, a3, MatMulLoopK
sll t4, t0, a4 # t4 = byte offset of row i
add t4, t4, t1 #    + 4*j
add t4, t4, a2 #    + base of C
sw t3, (t4) # c[i][j] = accumulator
addi t1, t1, 4 # j++
blt t1, a3, MatMulLoopJ
addi t0, t0, 4 # i++
blt t0, a3, MatMulLoopI
# Computation finished. Before parking in the endless loop, read every word of
# C back once (bytes 0 .. 4*N*N-1).
# NOTE(review): presumably this read-back exists to pull the results through
# the cache so they can be observed - confirm against the lab hand-out.
add t0, zero, 1
sll t0, t0, a4
sll t0, t0, a4
slli t0, t0, 2 # t0 = (1 << a4 << a4) << 2 = 4*N*N = size of C in bytes
addi t1, zero, 0 # t1 = current byte address, starting at 0
Loop:
lw t2, (t1) # value is discarded; only the memory access matters
addi t1, t1, 4
blt t1, t0, Loop
InfLoop:
jal zero, InfLoop # endless loop: program done
\ No newline at end of file
# Overview: in-place quicksort of an array of words (RV32I assembly)
# Author: WangXuan
.org 0x0
.global _start
_start:
main:
xor a3, zero, 0x100 # problem size: a3 = 0x100, i.e. 256 numbers are sorted
lui sp, 0x00001 # stack pointer: sp = 0x1000 (the stack grows downwards from here)
xor a0, zero, zero # argument a0 = 0: the array starts at RAM address 0
xor a1, zero, zero # argument a1 = 0: sorting starts at byte offset 0 (first element)
addi a2, a3 , -1
slli a2, a2 , 2 # argument a2 = byte offset of the last element: (256-1)*4 = 0x3fc
jal ra , QuickSort # sort
# After sorting, read every element back once.
addi t0, a3, 0
addi t1, a0, 0
slli t0, t0, 2 # t0 = element count * 4 = end byte address of the read-back
slli t1, t1, 2 # t1 = a0 << 2 = 0 here. NOTE(review): a0 is used as a byte address inside QuickSort, so this shift is only harmless because a0 = 0 - confirm before changing a0.
Loop:
lw t2, (t1) # value is discarded; only the memory access matters
addi t1, t1, 4
blt t1, t0, Loop
infinity_loop:
jal zero, infinity_loop # sorting finished: endless loop
QuickSort:
# Function QuickSort: in-place ascending quicksort.
#   a0 = base address of the array
#   a1 = start, byte offset of the first element of the range
#   a2 = end,   byte offset of the last element of the range (inclusive)
# Example: a0 = 0x00000100, a1 = 0, a2 = 31*4 sorts the 32 words starting at 0x00000100.
# Note: values are compared as signed numbers (blt/bge), so 0xffffffff (-1)
#       sorts before 0x00000001.
# Memory touched: the array itself plus a stack addressed through sp. Each
#       recursion level pushes 4 words (ra, start, end, i); the depth depends
#       on the data, so sp must be set by the caller with enough headroom.
# Registers changed: t0, t1, t2, t3, t4
bge a1, a2, QuickSortReturn # if start >= end there is nothing to sort: return
or t1, a1, zero # t1 = i = start
or t2, a2, zero # t2 = j = end
add t0, a0, t1 #
lw t0, (t0) # t0 = key = lst[start] (pivot)
PartationStart:
PartationFirstStart: # first scan: move j left while lst[j] >= key
bge t1, t2, PartationEnd # if i >= j, partitioning is done
add t3, a0, t2 #
lw t3, (t3) # t3 = lst[j]
blt t3, t0, PartationFirstEnd # if lst[j] < key, stop scanning
addi t2, t2, -4 # j--
jal zero, PartationFirstStart # continue scanning
PartationFirstEnd: # end of first scan
add t4 , a0, t1 # t4 = &lst[i]
sw t3 , (t4) # lst[i] = lst[j]
PartationSecondStart: # second scan: move i right while lst[i] <= key
bge t1, t2, PartationEnd # if i >= j, partitioning is done
add t3, a0, t1 #
lw t3, (t3) # t3 = lst[i]
blt t0, t3, PartationSecondEnd # if key < lst[i], stop scanning
addi t1, t1, 4 # i++
jal zero, PartationSecondStart # continue scanning
PartationSecondEnd: # end of second scan
add t4 , a0, t2 # t4 = &lst[j]
sw t3 , (t4) # lst[j] = lst[i]
blt t1, t2, PartationStart # while i < j, repeat both scans
PartationEnd:
add t4 , a0, t1 # t4 = &lst[i]
sw t0 , (t4) # lst[i] = key: the pivot is now in its final position
# Save ra, start, end and i, then sort the left part [start, i-1].
addi sp, sp, -4 # sp -= 4
sw ra, (sp) # push ra
addi sp, sp, -4 # sp -= 4
sw a1, (sp) # push start
addi sp, sp, -4 # sp -= 4
sw a2, (sp) # push end
addi sp, sp, -4 # sp -= 4
sw t1, (sp) # push i
addi a2, t1, -4 # a2 = i - 4 (previous element): end of the left part
jal ra , QuickSort
# Reload i, end and start. start is read but left on the stack; end and i are
# pushed again so the frame is intact for the second recursive call.
lw t1, (sp) # pop i
addi sp, sp, 4 # sp += 4
lw a2, (sp) # pop end
addi sp, sp, 4 # sp += 4
lw a1, (sp) # read start (sp is not advanced, so start stays on the stack)
addi sp, sp, -4 # sp -= 4
sw a2, (sp) # push end
addi sp, sp, -4 # sp -= 4
sw t1, (sp) # push i
addi a1, t1, 4 # a1 = i + 4 (next element): start of the right part
jal ra , QuickSort
# Tear down the frame: restore i, end, start and ra.
lw t1, (sp) # pop i
addi sp, sp, 4 # sp += 4
lw a2, (sp) # pop end
addi sp, sp, 4 # sp += 4
lw a1, (sp) # pop start
addi sp, sp, 4 # sp += 4
lw ra, (sp) # pop ra
addi sp, sp, 4 # sp += 4
QuickSortReturn: # function epilogue
jalr zero, ra, 0 # return
#
# Equivalent C code of QuickSort (indices here are element indices, whereas
# the assembly keeps them as byte offsets):
# void QuickSort(int *lst, int start, int end){
#     if(end>start){
#         int i = start,j = end,key = lst[start];
#         while(i < j){
#             for (;i < j && key <= lst[j];j--);
#             lst[i] = lst[j];
#             for (;i < j && key >= lst[i];i++);
#             lst[j] = lst[i];
#         }
#         lst[i] = key;
#         QuickSort(lst, start, i - 1);
#         QuickSort(lst, i + 1, end);
#     }
# }
#
#
\ No newline at end of file
# -*- coding:utf-8 -*-
# Python2 or Python3
# Author : WangXuan
#
# Purpose: assemble a RISC-V assembly file with the Windows build of the
#          RISC-V GNU toolchain and turn the resulting binary image into the
#          Verilog source of an instruction cache pre-loaded with the program.
#
import binascii
import os
import subprocess
import sys

verilog_head = '''// asm file name: %s
module InstructionCache(
input wire clk,
input wire write_en,
input wire [31:2] addr, debug_addr,
input wire [31:0] debug_input,
output reg [31:0] data, debug_data
);
// local variable
wire addr_valid = (addr[31:14] == 18'h0);
wire debug_addr_valid = (debug_addr[31:14] == 18'h0);
wire [11:0] dealt_addr = addr[13:2];
wire [11:0] dealt_debug_addr = debug_addr[13:2];
// cache content
reg [31:0] inst_cache[0:4095];
initial begin
data = 32'h0;
debug_data = 32'h0;
'''
verilog_tail = '''end
always@(posedge clk)
begin
data <= addr_valid ? inst_cache[dealt_addr] : 32'h0;
debug_data <= debug_addr_valid ? inst_cache[dealt_debug_addr] : 32'h0;
if(write_en & debug_addr_valid)
inst_cache[dealt_debug_addr] <= debug_input;
end
endmodule
'''

# Directory holding the toolchain executables, relative to the working directory.
RISCV_TOOLCHAIN_PATH = '.\\riscv32-gnu-toolchain-windows\\'

# Intermediate files created in the working directory and removed afterwards.
TMP_OBJECT = 'compile_tmp.o'
TMP_LINKED = 'compile_tmp.om'
TMP_BINARY = 'compile_tmp.bin'


def byte_wise_reverse(b):
    """Reverse the byte order of one 8-hex-digit word (little endian -> 32'h literal)."""
    return b[6:8] + b[4:6] + b[2:4] + b[0:2]


def image_to_words(image):
    """Split a raw little-endian binary image into 8-hex-digit instruction words.

    A trailing partial word is zero-padded so every returned string is a
    complete 32-bit literal.
    """
    image = image + b'\x00' * (-len(image) % 4)
    hex_text = binascii.b2a_hex(image).decode('ascii')
    return [byte_wise_reverse(hex_text[i:i+8]) for i in range(0, len(hex_text), 8)]


def run_tool(tool, args):
    """Run one toolchain executable without a shell; return its exit status.

    Passing the arguments as a list keeps file names containing spaces or
    shell metacharacters intact. A tool that cannot be started is reported
    and treated as a failure.
    """
    try:
        return subprocess.call([RISCV_TOOLCHAIN_PATH + tool] + list(args))
    except OSError as err:
        print(' *** Error: cannot run %s%s: %s' % (RISCV_TOOLCHAIN_PATH, tool, err))
        return -1


def remove_temporaries():
    """Delete whichever intermediate files exist (portable replacement for `del`)."""
    for name in (TMP_OBJECT, TMP_LINKED, TMP_BINARY):
        if os.path.exists(name):
            os.remove(name)


def main(argv):
    """Command-line entry point; returns the process exit status."""
    if len(argv) != 3:
        print(' Usage:\n python asm2verilog.py [INPUT ASM file] [OUTPUT Verilog file]')
        print(' Example:\n python asm2verilog.py QuickSort.S InstructionCache.v')
        return 0

    INPUT = argv[1]
    OUTPUT = argv[2]

    try:
        if run_tool('riscv32-elf-as', [INPUT, '-o', TMP_OBJECT, '-march=rv32i']) != 0:
            print('\n Assembling Error!')
            return 1
        if run_tool('riscv32-elf-ld', [TMP_OBJECT, '-o', TMP_LINKED]) != 0:
            print('\n Linking Error!')
            return 1
        if run_tool('riscv32-elf-objcopy', ['-O', 'binary', TMP_LINKED, TMP_BINARY]) != 0:
            print('\n Objcopy Error!')
            return 1
        with open(TMP_BINARY, 'rb') as f:
            image = f.read()
    finally:
        # Clean up on success and on every failure path.
        remove_temporaries()

    with open(OUTPUT, 'w') as f:
        f.write(verilog_head % (INPUT,))
        for index, instr_string in enumerate(image_to_words(image)):
            f.write(' inst_cache[%8d] = 32\'h%s;\n' % (index, instr_string, ))
        f.write(verilog_tail)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment