神经网络的FPGA实现:基础卷积操作(一) RGB三通道
Verilog HDL
Xilinx VIVADO
conv_pe.v 文件(下文 conv_layer 模块中例化的 3×3 卷积计算单元 conv_pe)见上述链接,需要一并加入工程。
`timescale 1ns / 1ps
// conv_layer
//
// One 3x3 convolution step over a multi-channel input window.
//   1. Weights and biases are streamed in one byte per clock through
//      weight_ab / bias_ab while the matching enable is high and are stored
//      in the local weight[] / bias[] memories.
//   2. Every 3x3 window presented on `fmap` (qualified by input_fmap_en) is
//      handed to three conv_pe instances, one per 72-bit slice of `fmap`.
//   3. The low 8 bits of each PE result are summed together with the
//      sign-extended bias[0]; the low 8 bits of that sum are driven on
//      end_data while output_en is high.
//
// NOTE: the datapath below instantiates exactly three conv_pe units and uses
// only bias[0], so although the ports are parameterised the module as written
// only matches CHANNEL_IN=3, CHANNEL_OUT=1.
//
// conv_pe is defined in a separate file (conv_pe.v).
module conv_layer#(
    parameter CHANNEL_IN=3,  // number of input channels (R, G, B)
    parameter CHANNEL_OUT=1  // number of output channels
)(
    input clk,                      // clock, all registers update on the rising edge
    input rst,                      // synchronous, active-high reset
    input input_weight_en,          // high: weight_ab holds a valid weight this cycle
    input input_bias_en,            // high: bias_ab holds a valid bias this cycle
    // kernel parameters
    input [7:0] weight_ab,          // weight byte written this cycle
    input [7:0] bias_ab,            // bias byte written this cycle
    output write_done_weight_bias,  // high once all weights AND all biases are stored
    // input feature map
    input input_fmap_en,            // high: fmap holds a valid window this cycle
    input [9*8*CHANNEL_IN-1:0]fmap, // 3x3 window, 8 bits per pixel, CHANNEL_IN channels
    output output_en,               // high when end_data is valid
    output [8*CHANNEL_OUT-1:0] end_data // 8-bit result (one output channel)
);

    // Number of entries expected on each parameter-loading port.
    localparam WEIGHT_NUM = 9*CHANNEL_IN*CHANNEL_OUT;
    localparam BIAS_NUM   = CHANNEL_OUT;

    // Parameter storage.
    // Original author's note: this memory style is intended for simulation and
    // should be replaced by a RAM for synthesis.
    // NOTE(review): every weight is read in parallel by the PEs below, which a
    // single RAM port cannot provide -- confirm the intended implementation.
    reg [7:0] weight [WEIGHT_NUM-1:0]; // WEIGHT_NUM 8-bit weights
    reg [7:0] bias   [BIAS_NUM-1:0];   // BIAS_NUM 8-bit biases

    // ------------------------------------------------------------------
    // Weight loading
    // ------------------------------------------------------------------
    reg [15:0] weight_count;     // number of weights stored so far
    reg        weight_write_done; // sticky flag: all weights stored
    always @(posedge clk) begin
        if(rst) begin
            weight_count      <= 0;
            weight_write_done <= 0;
        end else begin
            // Accept a weight only while there is still room. Without this
            // bound a held-high enable would index weight[] past its last
            // entry and keep the counter running.
            if(input_weight_en && (weight_count < WEIGHT_NUM)) begin
                weight[weight_count] <= weight_ab;
                weight_count         <= weight_count + 1;
            end
            // The flag rises the cycle after the counter reaches the total
            // and stays set until reset.
            if(weight_count == WEIGHT_NUM) begin
                weight_write_done <= 1;
            end
        end
    end

    // ------------------------------------------------------------------
    // Bias loading (same scheme as the weights)
    // ------------------------------------------------------------------
    reg [7:0] bias_count;      // number of biases stored so far
    reg       bias_write_done; // sticky flag: all biases stored
    always @(posedge clk) begin
        if(rst) begin
            bias_count      <= 0;
            bias_write_done <= 0;
        end else begin
            // Bounded for the same reason as the weight write above.
            if(input_bias_en && (bias_count < BIAS_NUM)) begin
                bias[bias_count] <= bias_ab;
                bias_count       <= bias_count + 1;
            end
            if(bias_count == BIAS_NUM) begin
                bias_write_done <= 1;
            end
        end
    end

    // Parameter transfer is complete once both memories are filled.
    assign write_done_weight_bias = weight_write_done & bias_write_done;

    // ------------------------------------------------------------------
    // Convolution processing elements
    //
    // The number of conv_pe units is chosen for maximum throughput: one per
    // (input channel, output channel) pair, i.e. 3*1 here. In a real design
    // the PE count is a resource/speed trade-off.
    //
    // PE k (k = 1..3) uses weight[9*(k-1) .. 9*(k-1)+8] and the 72-bit slice
    // fmap[72*k-1 : 72*(k-1)]; within a slice the lowest byte is fmap_00.
    // ------------------------------------------------------------------
    wire        valid_out_1, valid_out_2, valid_out_3; // per-PE result valid
    wire [31:0] sum_data_1,  sum_data_2,  sum_data_3;  // per-PE result

    conv_pe uut_conv_pe_1(
        .clk(clk),
        .rst(rst),
        .input_en(input_fmap_en),
        .kernel_00(weight[0]),
        .kernel_01(weight[1]),
        .kernel_02(weight[2]),
        .kernel_10(weight[3]),
        .kernel_11(weight[4]),
        .kernel_12(weight[5]),
        .kernel_20(weight[6]),
        .kernel_21(weight[7]),
        .kernel_22(weight[8]),
        .fmap_00(fmap[7:0]),
        .fmap_01(fmap[15:8]),
        .fmap_02(fmap[23:16]),
        .fmap_10(fmap[31:24]),
        .fmap_11(fmap[39:32]),
        .fmap_12(fmap[47:40]),
        .fmap_20(fmap[55:48]),
        .fmap_21(fmap[63:56]),
        .fmap_22(fmap[71:64]),
        .valid_out(valid_out_1),
        .sum_data(sum_data_1)
    );

    conv_pe uut_conv_pe_2(
        .clk(clk),
        .rst(rst),
        .input_en(input_fmap_en),
        .kernel_00(weight[9]),
        .kernel_01(weight[10]),
        .kernel_02(weight[11]),
        .kernel_10(weight[12]),
        .kernel_11(weight[13]),
        .kernel_12(weight[14]),
        .kernel_20(weight[15]),
        .kernel_21(weight[16]),
        .kernel_22(weight[17]),
        .fmap_00(fmap[79:72]),
        .fmap_01(fmap[87:80]),
        .fmap_02(fmap[95:88]),
        .fmap_10(fmap[103:96]),
        .fmap_11(fmap[111:104]),
        .fmap_12(fmap[119:112]),
        .fmap_20(fmap[127:120]),
        .fmap_21(fmap[135:128]),
        .fmap_22(fmap[143:136]),
        .valid_out(valid_out_2),
        .sum_data(sum_data_2)
    );

    conv_pe uut_conv_pe_3(
        .clk(clk),
        .rst(rst),
        .input_en(input_fmap_en),
        .kernel_00(weight[18]),
        .kernel_01(weight[19]),
        .kernel_02(weight[20]),
        .kernel_10(weight[21]),
        .kernel_11(weight[22]),
        .kernel_12(weight[23]),
        .kernel_20(weight[24]),
        .kernel_21(weight[25]),
        .kernel_22(weight[26]),
        .fmap_00(fmap[151:144]),
        .fmap_01(fmap[159:152]),
        .fmap_02(fmap[167:160]),
        .fmap_10(fmap[175:168]),
        .fmap_11(fmap[183:176]),
        .fmap_12(fmap[191:184]),
        .fmap_20(fmap[199:192]),
        .fmap_21(fmap[207:200]),
        .fmap_22(fmap[215:208]),
        .valid_out(valid_out_3),
        .sum_data(sum_data_3)
    );

    // ------------------------------------------------------------------
    // Channel accumulation and output
    // ------------------------------------------------------------------
    wire [31:0]      sum_data_32; // sum over the three channels plus bias
    wire signed [7:0] bias_temp;  // bias of output channel 0
    assign bias_temp = bias[0];

    // The result is valid only when all three PEs report a valid output.
    assign output_en = valid_out_1 & valid_out_2 & valid_out_3;

    // Keep only the low 8 bits of each per-channel result (zero-extended to
    // 32 bits); the original author describes this as quantising to 0..255.
    wire [31:0] sum_data_1_Q, sum_data_2_Q, sum_data_3_Q;
    assign sum_data_1_Q = {24'd0, sum_data_1[7:0]};
    assign sum_data_2_Q = {24'd0, sum_data_2[7:0]};
    assign sum_data_3_Q = {24'd0, sum_data_3[7:0]};

    // Add the three channels and the sign-extended bias while the output is
    // valid, otherwise drive zero. Only the low 8 bits are forwarded.
    assign sum_data_32 = (output_en==1) ?
                         sum_data_1_Q + sum_data_2_Q + sum_data_3_Q + {{24{bias_temp[7]}}, bias_temp} :
                         32'd0;
    assign end_data = sum_data_32[7:0];

    // Counts the cycles in which output_en was high. It drives no output;
    // presumably kept for waveform inspection.
    reg [31:0] conv_pe_count;
    always @(posedge clk) begin
        if(rst) begin
            conv_pe_count <= 0;
        end else begin
            if(output_en) begin
                conv_pe_count <= conv_pe_count + 1;
            end
        end
    end

endmodule
仿真需要 imageBlueChannels.txt、imageGreenChannels.txt、imageRedChannels.txt 以及 bias.txt、weight.txt 共五个文件;请将它们放入工程文件夹下,并把下面 testbench 中 $readmemh / $fopen 使用的绝对路径改成这些文件在本机上的实际路径。
`timescale 1ns / 1ps
// tb_conv_layer
//
// Testbench for conv_layer (CHANNEL_IN=3, CHANNEL_OUT=1).
//   1. Holds reset for 1000 time units.
//   2. Streams weight.txt and bias.txt into the DUT, one byte per clock.
//   3. Once the DUT reports write_done_weight_bias, slides a 3x3 window over
//      the three image channel files and presents one window every two
//      clocks.
//   4. Writes every valid DUT result as a hex line to conv_layer_result.txt,
//      then closes the file and ends the simulation once the sweep is over
//      and the DUT output has been quiet for DRAIN_CYCLES clocks.
//
// In a real system the data would reach the device through an external
// interface (UART, PCIe, optical/Ethernet link, ...) or be buffered in
// external memory such as DDR; here everything is read from text files.
module tb_conv_layer;

    reg clk;
    reg rst;

    // Active-high reset, released after 1000 time units.
    initial begin
        rst = 1;
        #1000
        rst = 0;
    end

    // Free-running clock with a period of 10 time units.
    initial begin
        clk = 0;
        forever #5 clk = ~clk;
    end

    localparam CHANNEL_IN=3;
    localparam CHANNEL_OUT=1;

    reg input_weight_en,input_bias_en,input_fmap_en; // weight / bias / window enables
    reg [7:0] weight_ab,bias_ab;                     // byte written this cycle
    reg [9*8*CHANNEL_IN-1:0]fmap_ab;                 // 3x3 window * 8 bit * CHANNEL_IN
    wire write_done_weight_bias,output_en;           // parameters stored / result valid
    wire [8*CHANNEL_OUT-1:0]end_data;                // 8-bit result, one channel

    conv_layer#(
        .CHANNEL_IN(CHANNEL_IN),
        .CHANNEL_OUT(CHANNEL_OUT)
    )uut_conv_layer(
        .clk(clk),
        .rst(rst),
        .input_weight_en(input_weight_en),
        .input_bias_en(input_bias_en),
        .weight_ab(weight_ab),
        .bias_ab(bias_ab),
        .write_done_weight_bias(write_done_weight_bias),
        .input_fmap_en(input_fmap_en),
        .fmap(fmap_ab),
        .output_en(output_en),
        .end_data(end_data)
    );

    // ------------------------------------------------------------------
    // Weight streaming: 9*CHANNEL_IN*CHANNEL_OUT bytes, read as hex values
    // ------------------------------------------------------------------
    reg [7:0] weight[9*CHANNEL_IN*CHANNEL_OUT-1:0];
    initial begin
        $readmemh("C://Users//mayn//Desktop//nn//conv//conv_layer//weight.txt",weight);
    end

    integer weight_count; // index of the next weight to send
    always @(posedge clk) begin
        if(rst) begin
            weight_count    <= 0;
            input_weight_en <= 0;
            weight_ab       <= 0;
        end else begin
            if(weight_count<9*CHANNEL_IN*CHANNEL_OUT)begin
                // More weights to send: present the next one with its enable.
                input_weight_en <= 1;
                weight_count    <= weight_count+1;
                weight_ab       <= weight[weight_count];
            end
            else begin
                // All weights sent: idle the port.
                input_weight_en <= 0;
                weight_ab       <= 0;
            end
        end
    end

    // ------------------------------------------------------------------
    // Bias streaming: CHANNEL_OUT bytes, read as hex values
    // ------------------------------------------------------------------
    reg [7:0]bias[CHANNEL_OUT-1:0];
    initial begin
        $readmemh("C://Users//mayn//Desktop//nn//conv//conv_layer//bias.txt",bias);
    end

    integer bias_count; // index of the next bias to send
    always @(posedge clk) begin
        if(rst) begin
            bias_count    <= 0;
            input_bias_en <= 0;
            bias_ab       <= 0;
        end else begin
            if(bias_count<CHANNEL_OUT)begin
                input_bias_en <= 1;
                bias_count    <= bias_count+1;
                bias_ab       <= bias[bias_count];
            end
            else begin
                input_bias_en <= 0;
                bias_ab       <= 0;
            end
        end
    end

    // ------------------------------------------------------------------
    // Input image: three channels stored row-major, IMAGE_WIDTH pixels per row
    // ------------------------------------------------------------------
    localparam IMAGE_WIDTH=482;
    localparam IMAGE_HIGH=322;

    reg [7:0] fmap_R[IMAGE_WIDTH*IMAGE_HIGH-1:0]; // red channel, whole image
    reg [7:0] fmap_G[IMAGE_WIDTH*IMAGE_HIGH-1:0]; // green channel, whole image
    reg [7:0] fmap_B[IMAGE_WIDTH*IMAGE_HIGH-1:0]; // blue channel, whole image
    initial begin
        $readmemh("C://Users//mayn//Desktop//nn//conv//conv_layer//imageBlueChannels.txt",fmap_B);
        $readmemh("C://Users//mayn//Desktop//nn//conv//conv_layer//imageGreenChannels.txt",fmap_G);
        $readmemh("C://Users//mayn//Desktop//nn//conv//conv_layer//imageRedChannels.txt",fmap_R);
    end

    // ------------------------------------------------------------------
    // Window sweep
    //
    // i = column of the window's top-left pixel, j = its row.
    // State S_SEND presents the window at (i, j) and advances i;
    // state S_NEXT drops the enable and, at the end of a row, moves to the
    // next row; state S_DONE idles after the last window.
    //
    // Window packing (MSB to LSB): R rows 2,1,0 | G rows 2,1,0 | B rows 2,1,0,
    // each row as columns 2,1,0. So fmap_ab[71:0] is the B window,
    // [143:72] the G window and [215:144] the R window, and the lowest byte
    // of each slice is the top-left pixel.
    // ------------------------------------------------------------------
    localparam S_SEND = 0;
    localparam S_NEXT = 1;
    localparam S_DONE = 2;

    integer i,j;
    reg [2:0]states;
    always @(posedge clk ) begin
        if(rst) begin
            i             <= 0;
            j             <= 0;
            states        <= S_SEND;
            input_fmap_en <= 0;
            fmap_ab       <= 0;
        end
        else if(write_done_weight_bias) begin // start only after parameters are stored
            case(states)
                S_SEND:begin
                    if(i<IMAGE_WIDTH-2)begin // output width = IMAGE_WIDTH-3+1 = 480
                        input_fmap_en <= 1;
                        // Non-blocking, like every other assignment to fmap_ab:
                        // the DUT then sees the window and its enable change
                        // together, with no simulator-order dependence.
                        fmap_ab <= {
                            fmap_R[i+2+IMAGE_WIDTH*(j+2)],fmap_R[i+1+IMAGE_WIDTH*(j+2)],fmap_R[i+0+IMAGE_WIDTH*(j+2)], // R, window row 2
                            fmap_R[i+2+IMAGE_WIDTH*(j+1)],fmap_R[i+1+IMAGE_WIDTH*(j+1)],fmap_R[i+0+IMAGE_WIDTH*(j+1)], // R, window row 1
                            fmap_R[i+2+IMAGE_WIDTH*(j+0)],fmap_R[i+1+IMAGE_WIDTH*(j+0)],fmap_R[i+0+IMAGE_WIDTH*(j+0)], // R, window row 0
                            fmap_G[i+2+IMAGE_WIDTH*(j+2)],fmap_G[i+1+IMAGE_WIDTH*(j+2)],fmap_G[i+0+IMAGE_WIDTH*(j+2)],
                            fmap_G[i+2+IMAGE_WIDTH*(j+1)],fmap_G[i+1+IMAGE_WIDTH*(j+1)],fmap_G[i+0+IMAGE_WIDTH*(j+1)],
                            fmap_G[i+2+IMAGE_WIDTH*(j+0)],fmap_G[i+1+IMAGE_WIDTH*(j+0)],fmap_G[i+0+IMAGE_WIDTH*(j+0)],
                            fmap_B[i+2+IMAGE_WIDTH*(j+2)],fmap_B[i+1+IMAGE_WIDTH*(j+2)],fmap_B[i+0+IMAGE_WIDTH*(j+2)],
                            fmap_B[i+2+IMAGE_WIDTH*(j+1)],fmap_B[i+1+IMAGE_WIDTH*(j+1)],fmap_B[i+0+IMAGE_WIDTH*(j+1)],
                            fmap_B[i+2+IMAGE_WIDTH*(j+0)],fmap_B[i+1+IMAGE_WIDTH*(j+0)],fmap_B[i+0+IMAGE_WIDTH*(j+0)]};
                        i      <= i+1; // slide the window one column to the right
                        states <= S_NEXT;
                    end
                    else begin
                        input_fmap_en <= 0;
                    end
                end
                S_NEXT:begin
                    input_fmap_en <= 0;
                    if(i==IMAGE_WIDTH-2)begin // row finished: back to column 0, next row
                        i <= 0;
                        j <= j+1;
                    end
                    if(j<IMAGE_HIGH-2)begin // output height = IMAGE_HIGH-3+1
                        states <= S_SEND;
                    end
                    // Last window of the last row was just sent; this later
                    // assignment overrides the one above.
                    if((j==IMAGE_HIGH-3)&&(i==IMAGE_WIDTH-2))begin
                        states <= S_DONE;
                    end
                end
                S_DONE:begin // sweep finished: idle the window port
                    input_fmap_en <= 0;
                    fmap_ab       <= 0;
                end
            endcase
        end
    end

    // ------------------------------------------------------------------
    // Result capture
    // ------------------------------------------------------------------
    integer end_temp; // file descriptor of the result file
    initial begin
        end_temp=$fopen("C://Users//mayn//Desktop//nn//conv//conv_layer//conv_layer_result.txt","w");
        // $fopen returns 0 on failure; stop instead of silently losing results.
        if(end_temp==0)begin
            $display("tb_conv_layer: cannot open conv_layer_result.txt for writing");
            $finish;
        end
    end

    // One hex line per valid DUT result.
    always @(posedge clk) begin
        if(output_en)begin
            $fwrite(end_temp,"%h\n",$signed(end_data));
        end
    end

    // Close the result file and end the run once the sweep is done and the
    // DUT has produced no output for DRAIN_CYCLES consecutive clocks.
    // NOTE(review): DRAIN_CYCLES assumes the conv_pe pipeline latency is far
    // below 1024 clocks -- confirm against conv_pe.v.
    localparam DRAIN_CYCLES = 1024;
    integer idle_cycles;
    always @(posedge clk) begin
        if(rst || output_en || (states!=S_DONE))begin
            idle_cycles <= 0;
        end
        else begin
            idle_cycles <= idle_cycles+1;
            if(idle_cycles==DRAIN_CYCLES)begin
                $fclose(end_temp);
                $finish;
            end
        end
    end

endmodule