本文先總結不同AXI IP核的實現的方法,性能的對比,性能差異的分析,可能改進的方面。使用的硬件平臺是Zedboard。
不同的AXI總線卷積加速模塊的概況
這次實現并逐漸優化了三個版本的卷積加速模塊,先簡要描述各個版本的主要內容。
版本一
版本一主要是用來測試AXI總線IP核的實現可能。
該模塊擁有19個32位寄存器
其中前9個寄存器用來保存需要計算的值
后面9個寄存器用來保存卷積核
在讀取第19個寄存器的地址的時候計算9個寄存器的卷積和(該計算可以在一個時鐘周期內完成)
9個寄存器單獨賦值,程序中分別向對應地址寫入內容,通過總線進行傳輸。
故樂觀的來算,需要10個總線周期可以獲取一個輸出
可以從驅動的書寫簡單理解一下:
/*
 * Version 1 driver: compute every output point by writing all nine window
 * values to the accelerator and reading the sum back.
 *
 * For each (i, j) the nine neighbours arr[i-2..i][j-2..j] are written to
 * byte offsets 0..32 of the IP core, then the 32-bit word at byte offset 72
 * is read and stored in the global res[i][j]. That is nine writes plus one
 * read per output point.
 *
 * filter           - unused here; this routine never writes the kernel
 *                    registers.
 * arr              - input matrix.
 * filterW, filterH - kernel dimensions, used only for the loop bounds.
 * arrW, arrH       - input dimensions, used only for the loop bounds.
 *
 * A '=' progress mark is printed after every row whose index is a multiple
 * of 15.
 */
void Conv_HW(int filter[3][3], int arr[100][100],
             int filterW, int filterH, int arrW, int arrH) {
    int i, j;

    (void)filter;

    for (i = 2; i < filterH + arrH - 3; i++) {
        for (j = 2; j < filterW + arrW - 3; j++) {
            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR, arr[i][j]);
            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 4, arr[i][j - 1]);
            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i][j - 2]);
            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 12, arr[i - 1][j]);
            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 16, arr[i - 1][j - 1]);
            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i - 1][j - 2]);
            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 24, arr[i - 2][j]);
            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 28, arr[i - 2][j - 1]);
            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i - 2][j - 2]);
            res[i][j] = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72);
        }
        if (i % 15 == 0)
            printf("=");
    }
}
版本一性能
版本一性能最慘,由于沒有時間戳,目測軟件計算速度遠遠快于FPGA核心運算速度。
版本一的改進方向就是引入滑動窗口,以最大程度減少總線周期。
版本二
版本二引入滑動窗口,和初期設計的概念相同。
該模塊擁有19個32位寄存器
其中前9個寄存器用來保存需要計算的值
后面9個寄存器用來保存卷積核
在讀取第19個寄存器的地址的時候計算9個寄存器的卷積和(該計算可以在一個時鐘周期內完成)
三個寄存器滑動賦值,該計算窗口在計算矩陣上滑動。除了冷啟動時需要多花兩輪寫入(兩列,共六次寫入)用來預載寄存器,后面的每一個計算只需要四個總線周期(三次寫入加一次讀取)。
可以通過寫的驅動簡單理解一下:
/*
 * Version 2 driver: sliding-window convolution.
 *
 * Each write to byte offset 8, 20 or 32 pushes one new value into the
 * corresponding row of the 3x3 window. A row therefore starts with two
 * preload columns (six writes); after that each output point costs three
 * writes (the next column) plus one read of the sum at byte offset 72.
 * Results are stored in the global res[i][j].
 *
 * filter     - unused here; this routine never writes the kernel registers.
 * arr        - input matrix. Rows up to arrH and columns up to arrW are
 *              read, so the data must extend one element past arrW/arrH.
 * arrW, arrH - loop bounds; outputs are produced for 2 <= i < arrH and
 *              2 <= j < arrW.
 *
 * Fix over the published listing: the preload used the loop variable j,
 * which still held arrW from the previous row's inner loop, so every row
 * after the first was preloaded with the wrong columns. The preload now
 * always uses the columns just left of the first output column.
 */
void Conv_HW(int filter[3][3], int arr[100][100], int arrW, int arrH) {
    const int first_col = 2;
    int i, j;

    (void)filter;

    for (i = 2; i < arrH; i++) {
        /* Preload the two leftmost window columns for this row. */
        Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][first_col - 1]);
        Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][first_col - 1]);
        Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][first_col - 1]);
        Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][first_col]);
        Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][first_col]);
        Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][first_col]);

        for (j = first_col; j < arrW; j++) {
            /* Push column j + 1, then read the sum for the window centred on (i, j). */
            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j + 1]);
            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j + 1]);
            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j + 1]);
            res[i][j] = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72);
        }
    }
}
版本二性能
測試樣本 500*500的32bit單位的矩陣 計算200次。
軟件消耗33.78秒,卷積IP核心40.25秒
這樣的結果還是非常不樂觀,分析可能有兩種因素限制了IP核的速度:
兩個寄存器的乘法LUT太大,無法硬件優化
總線周期太慢太慢
版本三對于這兩種可能進行探索。
版本二的FPGA部分核心代碼
// Memory-mapped register select and write logic.
// Write data is accepted when axi_awready, S_AXI_AWVALID, axi_wready and
// S_AXI_WVALID are all asserted. Write strobes select which byte lanes of
// the addressed register are updated.
// All registers are cleared while reset (active low) is applied.
//
// Sliding window: a write to register 2, 5 or 8 first shifts that row
// (reg0 <= reg1, reg1 <= reg2, and likewise for 3..5 and 6..8) and then
// stores the new value in the row's last register. Registers 9..17 are
// plain storage. There is no write case for index 5'h12.
assign slv_reg_wren = axi_wready && S_AXI_WVALID && axi_awready && S_AXI_AWVALID;

// Return `cur` with every byte lane whose strobe bit is set replaced by
// the matching byte lane of `wdata`.
function [C_S_AXI_DATA_WIDTH-1:0] wstrb_merge;
  input [C_S_AXI_DATA_WIDTH-1:0]     cur;
  input [C_S_AXI_DATA_WIDTH-1:0]     wdata;
  input [(C_S_AXI_DATA_WIDTH/8)-1:0] strb;
  integer lane;
  begin
    wstrb_merge = cur;
    for ( lane = 0; lane <= (C_S_AXI_DATA_WIDTH/8)-1; lane = lane+1 )
      if ( strb[lane] == 1 )
        wstrb_merge[(lane*8) +: 8] = wdata[(lane*8) +: 8];
  end
endfunction

always @( posedge S_AXI_ACLK )
begin
  if ( S_AXI_ARESETN == 1'b0 )
    begin
      slv_reg0  <= 0;
      slv_reg1  <= 0;
      slv_reg2  <= 0;
      slv_reg3  <= 0;
      slv_reg4  <= 0;
      slv_reg5  <= 0;
      slv_reg6  <= 0;
      slv_reg7  <= 0;
      slv_reg8  <= 0;
      slv_reg9  <= 0;
      slv_reg10 <= 0;
      slv_reg11 <= 0;
      slv_reg12 <= 0;
      slv_reg13 <= 0;
      slv_reg14 <= 0;
      slv_reg15 <= 0;
      slv_reg16 <= 0;
      slv_reg17 <= 0;
    end
  else if (slv_reg_wren)
    begin
      case ( axi_awaddr[ADDR_LSB+OPT_MEM_ADDR_BITS:ADDR_LSB] )
        // Window row 0: registers 0..2
        5'h00: slv_reg0 <= wstrb_merge(slv_reg0, S_AXI_WDATA, S_AXI_WSTRB);
        5'h01: slv_reg1 <= wstrb_merge(slv_reg1, S_AXI_WDATA, S_AXI_WSTRB);
        5'h02:
          begin
            slv_reg0 <= slv_reg1;
            slv_reg1 <= slv_reg2;
            slv_reg2 <= wstrb_merge(slv_reg2, S_AXI_WDATA, S_AXI_WSTRB);
          end
        // Window row 1: registers 3..5
        5'h03: slv_reg3 <= wstrb_merge(slv_reg3, S_AXI_WDATA, S_AXI_WSTRB);
        5'h04: slv_reg4 <= wstrb_merge(slv_reg4, S_AXI_WDATA, S_AXI_WSTRB);
        5'h05:
          begin
            slv_reg3 <= slv_reg4;
            slv_reg4 <= slv_reg5;
            slv_reg5 <= wstrb_merge(slv_reg5, S_AXI_WDATA, S_AXI_WSTRB);
          end
        // Window row 2: registers 6..8
        5'h06: slv_reg6 <= wstrb_merge(slv_reg6, S_AXI_WDATA, S_AXI_WSTRB);
        5'h07: slv_reg7 <= wstrb_merge(slv_reg7, S_AXI_WDATA, S_AXI_WSTRB);
        5'h08:
          begin
            slv_reg6 <= slv_reg7;
            slv_reg7 <= slv_reg8;
            slv_reg8 <= wstrb_merge(slv_reg8, S_AXI_WDATA, S_AXI_WSTRB);
          end
        // Registers 9..17: plain strobed writes
        5'h09: slv_reg9  <= wstrb_merge(slv_reg9,  S_AXI_WDATA, S_AXI_WSTRB);
        5'h0A: slv_reg10 <= wstrb_merge(slv_reg10, S_AXI_WDATA, S_AXI_WSTRB);
        5'h0B: slv_reg11 <= wstrb_merge(slv_reg11, S_AXI_WDATA, S_AXI_WSTRB);
        5'h0C: slv_reg12 <= wstrb_merge(slv_reg12, S_AXI_WDATA, S_AXI_WSTRB);
        5'h0D: slv_reg13 <= wstrb_merge(slv_reg13, S_AXI_WDATA, S_AXI_WSTRB);
        5'h0E: slv_reg14 <= wstrb_merge(slv_reg14, S_AXI_WDATA, S_AXI_WSTRB);
        5'h0F: slv_reg15 <= wstrb_merge(slv_reg15, S_AXI_WDATA, S_AXI_WSTRB);
        5'h10: slv_reg16 <= wstrb_merge(slv_reg16, S_AXI_WDATA, S_AXI_WSTRB);
        5'h11: slv_reg17 <= wstrb_merge(slv_reg17, S_AXI_WDATA, S_AXI_WSTRB);
        // Any other index: no register is written, all values are held.
        default: ;
      endcase
    end
end
// Memory-mapped register select and read logic.
// Slave register read enable is asserted when a valid read address is
// available and the slave is ready to accept it.
assign slv_reg_rden = axi_arready & S_AXI_ARVALID & ~axi_rvalid;

// Combinational read mux. Indices 0..17 return the stored registers.
// Index 5'h12 returns the convolution sum: each of registers 0..8 is
// multiplied by the register nine places above it (9..17) and the nine
// products are added. Any other index reads as 0.
// Blocking assignments are used because this block is purely combinational.
// NOTE(review): the sum is evaluated at the expression width, so high
// product bits are presumably discarded - confirm against the declared
// width of reg_data_out.
always @(*)
begin
  // Address decoding for reading registers
  case ( axi_araddr[ADDR_LSB+OPT_MEM_ADDR_BITS:ADDR_LSB] )
    5'h00   : reg_data_out = slv_reg0;
    5'h01   : reg_data_out = slv_reg1;
    5'h02   : reg_data_out = slv_reg2;
    5'h03   : reg_data_out = slv_reg3;
    5'h04   : reg_data_out = slv_reg4;
    5'h05   : reg_data_out = slv_reg5;
    5'h06   : reg_data_out = slv_reg6;
    5'h07   : reg_data_out = slv_reg7;
    5'h08   : reg_data_out = slv_reg8;
    5'h09   : reg_data_out = slv_reg9;
    5'h0A   : reg_data_out = slv_reg10;
    5'h0B   : reg_data_out = slv_reg11;
    5'h0C   : reg_data_out = slv_reg12;
    5'h0D   : reg_data_out = slv_reg13;
    5'h0E   : reg_data_out = slv_reg14;
    5'h0F   : reg_data_out = slv_reg15;
    5'h10   : reg_data_out = slv_reg16;
    5'h11   : reg_data_out = slv_reg17;
    5'h12   : reg_data_out = slv_reg0 * slv_reg9  +
                             slv_reg1 * slv_reg10 +
                             slv_reg2 * slv_reg11 +
                             slv_reg3 * slv_reg12 +
                             slv_reg4 * slv_reg13 +
                             slv_reg5 * slv_reg14 +
                             slv_reg6 * slv_reg15 +
                             slv_reg7 * slv_reg16 +
                             slv_reg8 * slv_reg17;
    default : reg_data_out = 0;
  endcase
end
版本三
先嘗試生成更小的LUT
該模塊擁有19個32位寄存器
其中前9個寄存器用來保存需要計算的值
卷積核固定在Verilog中,用來生成更小的LUT
一個計算只需要四個總線周期
性能測試
仍然軟件消耗33秒,卷積IP核心40秒
基本否決是LUT問題。
下面測試AXI總線問題:
假設所有數據均來自于FPGA,無需從總線寫入:
/*
 * Bus-only benchmark variant: no input data is written to the IP core.
 * For every output position the 32-bit word at byte offset 72 is read and
 * stored in the global res[i][j], so the run time reflects the cost of one
 * bus read per output point.
 *
 * filter, arr - unused in this variant.
 * arrW, arrH  - loop bounds; reads are issued for 2 <= i < arrH and
 *               2 <= j < arrW.
 */
void Conv_HW(int filter[3][3], int arr[100][100], int arrW, int arrH) {
    int i, j;

    (void)filter;
    (void)arr;

    for (i = 2; i < arrH; i++) {
        for (j = 2; j < arrW; j++) {
            res[i][j] = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72);
        }
    }
}
只需要9.47秒即可完成計算,并傳回CPU !!!
總結
至此,基本上可以否決利用AXI傳數據的可能,所有需要利用AXI總線傳輸數據的模塊均會被總線周期所連累,在優化了傳輸后,仍然無法解決該問題。確實需要一個更快的方式來傳輸數據。
在Altera的NIOS2中,直接利用IO口傳輸數據,無需總線周期,再因為NIOS II內核沒有流水線優化,所以硬件確實比較快。
附1:AXI4 總線的 FPGA 接口部分
先看總線接口:
// AXI4-Lite slave port list (excerpt; the module header is not part of
// this listing) followed by the internal AXI4-Lite registers.
// Users to add ports here
// User ports ends
// Do not modify the ports beyond this line
// Global Clock Signal
input wire S_AXI_ACLK,
// Global Reset Signal. This Signal is Active LOW
input wire S_AXI_ARESETN,
// Write address (issued by master, accepted by Slave)
input wire [C_S_AXI_ADDR_WIDTH-1 : 0] S_AXI_AWADDR,
// Write channel Protection type. This signal indicates the
// privilege and security level of the transaction, and whether
// the transaction is a data access or an instruction access.
input wire [2 : 0] S_AXI_AWPROT,
// Write address valid (high = address valid). This signal indicates that
// the master is signaling valid write address and control information.
input wire S_AXI_AWVALID,
// Write address ready. High means the slave is idle and ready to accept
// an address and associated control signals; low means the slave is busy.
// Note: this handshake is for the ADDRESS; the data handshake follows below.
output wire S_AXI_AWREADY,
// Write data (issued by master, accepted by Slave)
input wire [C_S_AXI_DATA_WIDTH-1 : 0] S_AXI_WDATA,
// Write strobes. This signal indicates which byte lanes hold
// valid data. There is one write strobe bit for each eight
// bits of the write data bus.
input wire [(C_S_AXI_DATA_WIDTH/8)-1 : 0] S_AXI_WSTRB,
// Write valid (high = data valid). This signal indicates that valid write
// data and strobes are available.
input wire S_AXI_WVALID,
// Write ready. High means the slave is idle and can accept the write
// data; low means the slave is busy.
output wire S_AXI_WREADY,
// Write response. This signal indicates the status
// of the write transaction. The original note lists the possible
// responses as OKAY, EXOKAY, SLVERR and DECERR.
output wire [1 : 0] S_AXI_BRESP,
// Write response valid (high = response valid). This signal indicates
// that the channel is signaling a valid write response.
output wire S_AXI_BVALID,
// Response ready. High means the master is idle and can accept a write
// response; low means the master is busy.
input wire S_AXI_BREADY,
// Read address (issued by master, accepted by Slave)
// NOTE(review): the original note describes this as the first address of
// a burst; confirm whether bursts apply to this AXI4-Lite interface.
input wire [C_S_AXI_ADDR_WIDTH-1 : 0] S_AXI_ARADDR,
// Protection type. This signal indicates the privilege
// and security level of the transaction, and whether the
// transaction is a data access or an instruction access.
// The original note recommends the value 000.
input wire [2 : 0] S_AXI_ARPROT,
// Read address valid. This signal indicates that the channel
// is signaling valid read address and control information.
input wire S_AXI_ARVALID,
// Read address ready. High means the slave is idle and ready to accept
// an address and associated control signals; low means the slave is busy.
output wire S_AXI_ARREADY,
// Read data (issued by slave)
output wire [C_S_AXI_DATA_WIDTH-1 : 0] S_AXI_RDATA,
// Read response. This signal indicates the status of the
// read transfer.
output wire [1 : 0] S_AXI_RRESP,
// Read valid. This signal indicates that the channel is
// signaling the required read data.
output wire S_AXI_RVALID,
// Read ready. This signal indicates that the master can
// accept the read data and response information.
input wire S_AXI_RREADY
);
// AXI4LITE signals
// Internal registers named after the AXI4-Lite address, ready, response
// and data signals; how they connect to the ports is outside this excerpt.
reg [C_S_AXI_ADDR_WIDTH-1 : 0] axi_awaddr;
reg axi_awready;
reg axi_wready;
reg [1 : 0] axi_bresp;
reg axi_bvalid;
reg [C_S_AXI_ADDR_WIDTH-1 : 0] axi_araddr;
reg axi_arready;
reg [C_S_AXI_DATA_WIDTH-1 : 0] axi_rdata;
reg [1 : 0] axi_rresp;
reg axi_rvalid;
其中最為重要的讀取總線信號尋址的部分:
// Register write logic for the ten-register example (slv_reg0..slv_reg9).
// A write is performed when the write-address and write-data handshakes
// are both complete in the same cycle.
assign slv_reg_wren = axi_wready && S_AXI_WVALID && axi_awready && S_AXI_AWVALID;

// Return `cur` with every byte lane whose strobe bit is set replaced by
// the matching byte lane of `wdata`; lanes with a cleared strobe keep
// their current value.
function [C_S_AXI_DATA_WIDTH-1:0] strobe_merge;
  input [C_S_AXI_DATA_WIDTH-1:0]     cur;
  input [C_S_AXI_DATA_WIDTH-1:0]     wdata;
  input [(C_S_AXI_DATA_WIDTH/8)-1:0] strb;
  integer lane;
  begin
    strobe_merge = cur;
    for ( lane = 0; lane <= (C_S_AXI_DATA_WIDTH/8)-1; lane = lane+1 )
      if ( strb[lane] == 1 )
        strobe_merge[(lane*8) +: 8] = wdata[(lane*8) +: 8];
  end
endfunction

always @( posedge S_AXI_ACLK )
begin
  if ( S_AXI_ARESETN == 1'b0 )
    begin
      slv_reg0 <= 0;
      slv_reg1 <= 0;
      slv_reg2 <= 0;
      slv_reg3 <= 0;
      slv_reg4 <= 0;
      slv_reg5 <= 0;
      slv_reg6 <= 0;
      slv_reg7 <= 0;
      slv_reg8 <= 0;
      slv_reg9 <= 0;
    end
  else if (slv_reg_wren)
    begin
      // Address decoding:
      //   * With 32-bit registers the two lowest address bits select a byte
      //     within the 4-byte word, so ADDR_LSB = 2.
      //   * With 64-bit registers three bits select one of 8 bytes, so
      //     ADDR_LSB = 3.
      //   * The bits from ADDR_LSB up to ADDR_LSB+OPT_MEM_ADDR_BITS select
      //     the register; ten registers need a 4-bit index.
      case ( axi_awaddr[ADDR_LSB+OPT_MEM_ADDR_BITS:ADDR_LSB] )
        4'h0: slv_reg0 <= strobe_merge(slv_reg0, S_AXI_WDATA, S_AXI_WSTRB);
        4'h1: slv_reg1 <= strobe_merge(slv_reg1, S_AXI_WDATA, S_AXI_WSTRB);
        4'h2: slv_reg2 <= strobe_merge(slv_reg2, S_AXI_WDATA, S_AXI_WSTRB);
        4'h3: slv_reg3 <= strobe_merge(slv_reg3, S_AXI_WDATA, S_AXI_WSTRB);
        4'h4: slv_reg4 <= strobe_merge(slv_reg4, S_AXI_WDATA, S_AXI_WSTRB);
        4'h5: slv_reg5 <= strobe_merge(slv_reg5, S_AXI_WDATA, S_AXI_WSTRB);
        4'h6: slv_reg6 <= strobe_merge(slv_reg6, S_AXI_WDATA, S_AXI_WSTRB);
        4'h7: slv_reg7 <= strobe_merge(slv_reg7, S_AXI_WDATA, S_AXI_WSTRB);
        4'h8: slv_reg8 <= strobe_merge(slv_reg8, S_AXI_WDATA, S_AXI_WSTRB);
        4'h9: slv_reg9 <= strobe_merge(slv_reg9, S_AXI_WDATA, S_AXI_WSTRB);
        // Any other index: no register is written, all values are held.
        default: ;
      endcase
    end
end
附2:AXI4的測試模塊與仿真測試
`timescale 1ns/1ns
// Simulation testbench for conv_v1_0_S00_AXI.
// It releases reset, writes nine values to byte addresses 0, 4, ..., 32
// (one new address/data pair every 4 ns), then presents a read of byte
// address 36. All signals are dumped to test.vcd for waveform viewing.
//
// Fixes over the published listing:
//   * BREADY/RREADY and AWPROT/ARPROT were undriven wires (high-Z into the
//     DUT); they are now tied to constant values.
//   * Stimulus registers are given defined values during reset instead of
//     starting as X.
//   * The clock loop uses `i = i + 1` so it is plain Verilog, not
//     SystemVerilog.
// NOTE(review): stimulus changes at multiples of 4 ns, which coincide with
// rising clock edges (period 2 ns, first rising edge at 2 ns); the original
// timing is kept so the waveforms stay comparable.
module conv_axi_test();
  parameter integer C_S00_AXI_DATA_WIDTH = 32;
  parameter integer C_S00_AXI_ADDR_WIDTH = 6;

  reg  s00_axi_aclk;
  // Global reset, active low
  reg  s00_axi_aresetn;
  reg  [C_S00_AXI_ADDR_WIDTH-1 : 0] s00_axi_awaddr;
  wire [2 : 0] s00_axi_awprot;
  reg  s00_axi_awvalid;
  wire s00_axi_awready;
  reg  [C_S00_AXI_DATA_WIDTH-1 : 0] s00_axi_wdata;
  reg  [(C_S00_AXI_DATA_WIDTH/8)-1 : 0] s00_axi_wstrb;
  reg  s00_axi_wvalid;
  wire s00_axi_wready;
  wire [1 : 0] s00_axi_bresp;
  wire s00_axi_bvalid;
  wire s00_axi_bready;
  reg  [C_S00_AXI_ADDR_WIDTH-1 : 0] s00_axi_araddr;
  wire [2 : 0] s00_axi_arprot;
  reg  s00_axi_arvalid;
  wire s00_axi_arready;
  wire [C_S00_AXI_DATA_WIDTH-1 : 0] s00_axi_rdata;
  wire [1 : 0] s00_axi_rresp;
  wire s00_axi_rvalid;
  wire s00_axi_rready;

  // The bench always accepts responses and uses protection encoding 000.
  assign s00_axi_awprot = 3'b000;
  assign s00_axi_arprot = 3'b000;
  assign s00_axi_bready = 1'b1;
  assign s00_axi_rready = 1'b1;

  conv_v1_0_S00_AXI # (
    .C_S_AXI_DATA_WIDTH(C_S00_AXI_DATA_WIDTH),
    .C_S_AXI_ADDR_WIDTH(C_S00_AXI_ADDR_WIDTH)
  ) conv_v1_0_S00_AXI_inst (
    .S_AXI_ACLK(s00_axi_aclk),
    .S_AXI_ARESETN(s00_axi_aresetn),
    .S_AXI_AWADDR(s00_axi_awaddr),
    .S_AXI_AWPROT(s00_axi_awprot),
    .S_AXI_AWVALID(s00_axi_awvalid),
    .S_AXI_AWREADY(s00_axi_awready),
    .S_AXI_WDATA(s00_axi_wdata),
    .S_AXI_WSTRB(s00_axi_wstrb),
    .S_AXI_WVALID(s00_axi_wvalid),
    .S_AXI_WREADY(s00_axi_wready),
    .S_AXI_BRESP(s00_axi_bresp),
    .S_AXI_BVALID(s00_axi_bvalid),
    .S_AXI_BREADY(s00_axi_bready),
    .S_AXI_ARADDR(s00_axi_araddr),
    .S_AXI_ARPROT(s00_axi_arprot),
    .S_AXI_ARVALID(s00_axi_arvalid),
    .S_AXI_ARREADY(s00_axi_arready),
    .S_AXI_RDATA(s00_axi_rdata),
    .S_AXI_RRESP(s00_axi_rresp),
    .S_AXI_RVALID(s00_axi_rvalid),
    .S_AXI_RREADY(s00_axi_rready)
  );

  // Clock: starts high, toggles every 1 ns for 1000 half-periods, then
  // ends the simulation.
  initial
    begin : clk_gen
      integer i;
      s00_axi_aclk = 1;
      for (i = 0; i < 1000; i = i + 1)
        begin
          #1 s00_axi_aclk = ~s00_axi_aclk;
        end
      $finish();
    end

  // Stimulus
  initial
    begin
      // Hold reset with every driven input in a known state.
      s00_axi_aresetn = 0;
      s00_axi_arvalid = 0;
      s00_axi_araddr  = 0;
      s00_axi_awvalid = 0;
      s00_axi_wvalid  = 0;
      s00_axi_awaddr  = 0;
      s00_axi_wstrb   = 4'b0000;
      s00_axi_wdata   = 0;

      // Release reset and start writing with all byte lanes enabled.
      #4 s00_axi_aresetn = 1;
      s00_axi_awvalid = 1;
      s00_axi_wvalid  = 1;
      s00_axi_awaddr  = 0;
      s00_axi_wstrb   = 4'b1111;
      s00_axi_wdata   = 3;
      #4 s00_axi_awaddr = 6'b000100;
      s00_axi_wdata = 21;
      #4 s00_axi_awaddr = 6'b001000;
      s00_axi_wdata = 19;
      #4 s00_axi_awaddr = 6'b001100;
      s00_axi_wdata = 22;
      #4 s00_axi_awaddr = 6'b010000;
      s00_axi_wdata = 20;
      #4 s00_axi_awaddr = 6'b010100;
      s00_axi_wdata = 13;
      #4 s00_axi_awaddr = 6'b011000;
      s00_axi_wdata = 16;
      #4 s00_axi_awaddr = 6'b011100;
      s00_axi_wdata = 14;
      #4 s00_axi_awaddr = 6'b100000;
      s00_axi_wdata = 7;

      // Present the read address for byte address 36.
      #4
      s00_axi_arvalid = 1;
      s00_axi_araddr  = 6'b100100;
    end

  // Waveform dump
  initial
    begin
      $dumpfile("test.vcd");
      $dumpvars();
    end
endmodule
利用iverilog進行仿真GTKwave顯示測試波形如下
新建IP核如下:
工程頂層圖如下:
附3:軟件驅動
#include <stdio.h>
#include <stdlib.h>

#include "platform.h"
#include "xbasic_types.h"
#include "xparameters.h"
#include "xil_io.h"
/*
 * Build-time test selector. main() checks test_single, test_func and
 * test_speed with #ifdef; only test_speed is defined here.
 */
#define test_speed
/* Output matrix written by Conv_SW() and Conv_HW() and printed by main(). */
int res[1000][1000];
/*
 * Crude busy-wait: spins through 1000 * 1000 * 100 empty iterations.
 *
 * The innermost counter is volatile so an optimising compiler cannot
 * remove the otherwise empty loops. The wall-clock duration depends on
 * the CPU clock and compiler settings.
 */
void delay() {
    int i, j;
    volatile int k;

    for (i = 0; i < 1000; i++) {
        for (j = 0; j < 1000; j++) {
            for (k = 0; k < 100; k++) {
                /* intentionally empty */
            }
        }
    }
}
/*
 * Print the first nine 32-bit registers of the IP core (byte offsets
 * 0, 4, ..., 32), one per line.
 *
 * Note: the published listing lost its string escapes; "\n" is restored
 * where a stray blank stood in for a line break. The value is cast to
 * unsigned int so it matches the %u conversion whatever u32 is defined as.
 */
void show_reg() {
    int i;
    u32 result;

    printf("\n============SHOW REG ================\n");
    for (i = 0; i < 9; i++) {
        result = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 4 * i);
        printf("Reg %3d : %u\n", i, (unsigned int)result);
    }
}
/*
 * Write the 3x3 kernel to the IP core.
 *
 * The nine coefficients are written in row-major order (filter[0][0]
 * first, filter[2][2] last) to consecutive 32-bit words starting at byte
 * offset 36 from the core's base address.
 */
void load_kernel(int filter[3][3]) {
    UINTPTR kernel_addr = (UINTPTR) XPAR_CONV_0_S00_AXI_BASEADDR + 36;
    int row, col;

    for (row = 0; row < 3; row++) {
        for (col = 0; col < 3; col++) {
            Xil_Out32(kernel_addr, filter[row][col]);
            kernel_addr += 0x4; /* advance to the next 32-bit register */
        }
    }
}
/*
 * Single-window smoke test: push three columns of test values into the
 * core through byte offsets 8, 20 and 32, dumping the registers with
 * show_reg() after each column.
 *
 * Note: the published listing lost its string escapes; "\n" is restored
 * where a stray blank stood in for a line break.
 */
void test_set() {
    /* Column 1 */
    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, 3);
    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, 22);
    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, 16);
    printf("1\n");
    show_reg();

    /* Column 2 */
    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, 21);
    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, 20);
    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, 14);
    printf("2\n");
    show_reg();

    /* Column 3 */
    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, 19);
    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, 13);
    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, 7);
    printf("3\n");
    show_reg();
}
/*
 * Software reference for the 3x3 convolution.
 *
 * For every 2 <= i < arrH and 2 <= j < arrW, stores in the global
 * res[i][j] the sum over r, c in 0..2 of
 *     filter[r][c] * arr[i - 1 + r][j - 1 + c]
 *
 * filter     - 3x3 kernel.
 * arr        - input matrix. Rows up to arrH and columns up to arrW are
 *              read, so the data must extend one element past arrW/arrH.
 * arrW, arrH - loop bounds.
 */
void Conv_SW(int filter[3][3], int arr[100][100], int arrW, int arrH) {
    int i, j, r, c;

    for (i = 2; i < arrH; i++) {
        for (j = 2; j < arrW; j++) {
            int acc = 0;

            for (r = 0; r < 3; r++) {
                for (c = 0; c < 3; c++) {
                    acc += filter[r][c] * arr[i - 1 + r][j - 1 + c];
                }
            }
            res[i][j] = acc;
        }
    }
}
/*
 * Hardware convolution through the sliding-window IP core.
 *
 * Per row, two window columns are preloaded (six writes to byte offsets
 * 8, 20 and 32). Then each output point needs three writes (the next
 * column) and one read of the sum at byte offset 72, which is stored in
 * the global res[i][j].
 *
 * filter     - unused here; the kernel is expected to be in the core
 *              already (main() calls load_kernel() first).
 * arr        - input matrix. Rows up to arrH and columns up to arrW are
 *              read, so the data must extend one element past arrW/arrH.
 * arrW, arrH - loop bounds; outputs are produced for 2 <= i < arrH and
 *              2 <= j < arrW, the same range as Conv_SW().
 *
 * Fix over the published listing: the preload indexed columns with the
 * loop variable j, which still held arrW from the previous row's inner
 * loop. Only the first row was preloaded correctly. The preload now uses
 * fixed column indices.
 */
void Conv_HW(int filter[3][3], int arr[100][100], int arrW, int arrH) {
    const int start_col = 2;
    int i, j;

    (void)filter;

    for (i = 2; i < arrH; i++) {
        /* Preload columns start_col - 1 and start_col of the three rows. */
        Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][start_col - 1]);
        Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][start_col - 1]);
        Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][start_col - 1]);
        Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][start_col]);
        Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][start_col]);
        Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][start_col]);

        for (j = start_col; j < arrW; j++) {
            /* Slide the window one column to the right, then read the sum. */
            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j + 1]);
            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j + 1]);
            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j + 1]);
            res[i][j] = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72);
        }
    }
}
/*
 * Test entry point for the convolution IP core.
 *
 * Always: initialises the platform, clears the nine window registers
 * (byte offsets 0..32) and loads the 3x3 kernel. Then, depending on which
 * macro is defined at build time:
 *   test_single - runs test_set() and prints the sum read at offset 72.
 *   test_func   - fills a 20x20 matrix with rand() % 20, runs Conv_SW()
 *                 and Conv_HW() and prints both results for comparison.
 *   test_speed  - runs Conv_SW() and then Conv_HW() 200 times each with
 *                 arrW = arrH = 500.
 *
 * Changes over the published listing:
 *   - "\n" is restored where a stray blank stood in for a lost line break.
 *   - arr is static, so it is zero-initialised; test_speed never fills it.
 *   - cleanup_platform() runs in every build mode, not only test_speed.
 *
 * NOTE(review): test_speed passes 500x500 bounds while arr is declared
 * 100x100, so Conv_SW()/Conv_HW() index far beyond the array. Fixing this
 * needs a signature change in those functions; it is left unchanged here
 * so the benchmark matches the published measurements.
 *
 * Returns 0.
 */
int main() {
    printf("HELLO WORLD");
    u32 result;
    int filterW = 3;
    int filterH = 3;
    int arrW = 5;
    int arrH = 5;
    int resW = filterW + arrW - 1;
    int resH = filterH + arrH - 1;
    int i, j;
    int pFilter[3][3] = {
        { 1, 3, 1 },
        { 0, 5, 0 },
        { 2, 1, 2 },
    };
    static int arr[100][100];
    UINTPTR cur_addr = (UINTPTR) XPAR_CONV_0_S00_AXI_BASEADDR;

    /* Some of these are only used in particular build modes. */
    (void)result;
    (void)resW;
    (void)resH;
    (void)j;

    init_platform();

    /* Clear the nine window registers. */
    for (i = 0; i < 9; i++) {
        Xil_Out32(cur_addr, 0);
        cur_addr = cur_addr + 4;
    }
    load_kernel(pFilter);
    printf("Kernel Loaded\n");

#ifdef test_single
    test_set();
    result = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72);
    printf("Test Set Result %u", (unsigned int)result);
    show_reg();
#endif

#ifdef test_func
    srand(10);
    arrW = 20;
    arrH = 20;
    resH = filterH + arrH - 1;
    resW = filterW + arrW - 1;
    for (i = 0; i < arrH; i++) {
        for (j = 0; j < arrW; j++) {
            arr[i][j] = rand() % 20;
        }
    }
    printf("***********************************************\n");
    printf("Filter:\n");
    /* The kernel is printed bottom-right to top-left, as in the original. */
    for (i = filterH - 1; i >= 0; i--) {
        for (j = filterW - 1; j >= 0; j--) {
            printf("%d ", pFilter[i][j]);
        }
        printf("\n");
    }
    printf("***********************************************\n");
    printf("Matrix:\n");
    for (i = 0; i < arrH; i++) {
        for (j = 0; j < arrW; j++) {
            printf("%4d ", arr[i][j]);
        }
        printf("\n");
    }
    printf("***********************************************\n");
    printf("Software Start!\n");
    Conv_SW(pFilter, arr, arrW, arrH);
    printf("\nSoftware end!\n");
    printf("***********************************************\n");
    printf("Result1:\n");
    for (i = 0; i < resH; i++) {
        for (j = 0; j < resW; j++) {
            printf("%5d ", res[i][j]);
        }
        printf("\n");
    }
    /* Clear the result area so the hardware run starts from zeros. */
    for (i = 0; i < resH; i++) {
        for (j = 0; j < resW; j++) {
            res[i][j] = 0;
        }
    }
    printf("***********************************************\n");
    printf("HardWare Start!\n");
    Conv_HW(pFilter, arr, arrW, arrH);
    printf("\nHardWare end!");
    printf("Result2:\n");
    for (i = 0; i < resH; i++) {
        for (j = 0; j < resW; j++) {
            printf("%5d ", res[i][j]);
        }
        printf("\n");
    }
    printf("***********************************************\n");
#endif

#ifdef test_speed
    arrW = 500;
    arrH = 500;
    resH = filterH + arrH - 1;
    resW = filterW + arrW - 1;
    printf("Software Start!\n");
    for (i = 0; i < 200; i++) {
        Conv_SW(pFilter, arr, arrW, arrH);
    }
    printf("\nSoftware end!\n");
    printf("HardWare Start!\n");
    for (i = 0; i < 200; i++) {
        Conv_HW(pFilter, arr, arrW, arrH);
    }
    printf("\nHardWare end!");
#endif

    cleanup_platform();
    return 0;
}
-
IP核
+關注
關注
4文章
327瀏覽量
49485 -
AXI
+關注
關注
1文章
127瀏覽量
16622
發布評論請先 登錄
相關推薦
評論