神经网络的FPGA实现:基础卷积操作(一) RGB三通道
Verilog HDL
Xilinx VIVADO
conv_layer 模块中例化的 conv_pe 定义在 conv_pe.v 文件中(该文件在上述链接中),本文不再列出。

`timescale 1ns / 1ps
// conv_layer: one 3x3 convolution layer built from three conv_pe instances.
//
// Weights and biases are loaded serially, one 8-bit value per enabled clock
// cycle. Once both memories are full, write_done_weight_bias goes high. Each
// conv_pe then receives one 72-bit slice of fmap and nine consecutive weights;
// the low 8 bits of the three PE results are added together with the
// sign-extended bias[0], and the low 8 bits of that sum are driven on end_data.
//
// NOTE: the memories and the fmap port are sized by CHANNEL_IN / CHANNEL_OUT,
// but the three conv_pe instances, the fmap part-selects (up to bit 215), the
// weight indices (0..26) and the use of bias[0] are hard-wired for
// CHANNEL_IN=3, CHANNEL_OUT=1.
module conv_layer#(
	parameter CHANNEL_IN=3,  // number of input channels
	parameter CHANNEL_OUT=1  // number of output channels
	)(
	input clk, // clock; every register in this module updates on its rising edge
	input rst, // synchronous, active-high reset

	input input_weight_en, // when high, weight_ab is stored on this clock edge
	input input_bias_en,   // when high, bias_ab is stored on this clock edge
	// kernel parameter inputs
	input [7:0] weight_ab, // one weight per enabled cycle
	input [7:0] bias_ab,   // one bias per enabled cycle
	output write_done_weight_bias, // high once all weights and all biases have been stored

	// input feature map
	input input_fmap_en, // forwarded to every conv_pe as input_en
	input [9*8*CHANNEL_IN-1:0]fmap, // one 3x3 window of 8-bit values per input channel

	output output_en, // high when all three conv_pe valid_out signals are high
	output  [8*CHANNEL_OUT-1:0] end_data // low 8 bits of the channel sum plus bias
    );

localparam WEIGHT_NUM = 9*CHANNEL_IN*CHANNEL_OUT; // depth of the weight memory
localparam BIAS_NUM   = CHANNEL_OUT;              // depth of the bias memory

// Parameter storage as plain reg arrays.
// Original author's note: this memory style is not meant for synthesis;
// replace it with a RAM if the design is to be synthesized.
reg [7:0] weight [WEIGHT_NUM-1:0]; // WEIGHT_NUM 8-bit weights
reg [7:0] bias [BIAS_NUM-1:0];     // BIAS_NUM 8-bit biases

// ---------------------------------------------------------------------------
// Weight loading
// ---------------------------------------------------------------------------
reg [15:0]weight_count; // number of weights stored so far
reg weight_write_done;  // sticky flag: all weights stored
always @(posedge clk) begin
	if(rst) begin
		weight_count<= 0;
		weight_write_done<=0;
	end else begin
		// Store only while the counter still addresses a valid entry, so an
		// enable that stays high after the last weight cannot index past the
		// end of the memory or push the counter beyond WEIGHT_NUM.
		if(input_weight_en && (weight_count<WEIGHT_NUM))begin
			weight[weight_count]<=weight_ab;
			weight_count<=weight_count+1;
		end
		// The flag is raised one cycle after the last weight was stored and
		// stays high until reset.
		if(weight_count==WEIGHT_NUM)begin
			weight_write_done<=1;
		end
	end
end

// ---------------------------------------------------------------------------
// Bias loading
// ---------------------------------------------------------------------------
reg [7:0]bias_count; // number of biases stored so far
reg bias_write_done; // sticky flag: all biases stored
always @(posedge clk) begin
	if(rst) begin
		bias_count<= 0;
		bias_write_done<=0;
	end else begin
		// Same guard as for the weights: ignore writes once the memory is full.
		if(input_bias_en && (bias_count<BIAS_NUM))begin
			bias[bias_count]<=bias_ab;
			bias_count<=bias_count+1;
		end
		if(bias_count==BIAS_NUM)begin
			bias_write_done<=1;
		end
	end
end

assign write_done_weight_bias=weight_write_done&bias_write_done; // both parameter sets loaded

// ---------------------------------------------------------------------------
// Feature-map processing
// ---------------------------------------------------------------------------
// Original author's note: one conv_pe per (input channel, output channel)
// pair gives the fastest computation; in a real project the number of
// conv_pe instances is a trade-off between resources and speed.
// Here: 3 input channels x 1 output channel = 3 conv_pe instances.

wire valid_out_1,valid_out_2,valid_out_3;    // per-PE output-valid flags
wire [31:0]sum_data_1,sum_data_2,sum_data_3; // per-PE 32-bit results

// PE 1: weight[0..8] with fmap[71:0]
conv_pe uut_conv_pe_1(
	.clk(clk),
	.rst(rst),

	.input_en(input_fmap_en),
	.kernel_00(weight[0]),
	.kernel_01(weight[1]),
	.kernel_02(weight[2]),
	.kernel_10(weight[3]),
	.kernel_11(weight[4]),
	.kernel_12(weight[5]),
	.kernel_20(weight[6]),
	.kernel_21(weight[7]),
	.kernel_22(weight[8]),

	.fmap_00(fmap[7:0]),
	.fmap_01(fmap[15:8]),
	.fmap_02(fmap[23:16]),
	.fmap_10(fmap[31:24]),
	.fmap_11(fmap[39:32]),
	.fmap_12(fmap[47:40]),
	.fmap_20(fmap[55:48]),
	.fmap_21(fmap[63:56]),
	.fmap_22(fmap[71:64]),

	.valid_out(valid_out_1),
	.sum_data(sum_data_1)
	);

// PE 2: weight[9..17] with fmap[143:72]
conv_pe uut_conv_pe_2(
	.clk(clk),
	.rst(rst),

	.input_en(input_fmap_en),
	.kernel_00(weight[9]),
	.kernel_01(weight[10]),
	.kernel_02(weight[11]),
	.kernel_10(weight[12]),
	.kernel_11(weight[13]),
	.kernel_12(weight[14]),
	.kernel_20(weight[15]),
	.kernel_21(weight[16]),
	.kernel_22(weight[17]),

	.fmap_00(fmap[79:72]),
	.fmap_01(fmap[87:80]),
	.fmap_02(fmap[95:88]),
	.fmap_10(fmap[103:96]),
	.fmap_11(fmap[111:104]),
	.fmap_12(fmap[119:112]),
	.fmap_20(fmap[127:120]),
	.fmap_21(fmap[135:128]),
	.fmap_22(fmap[143:136]),

	.valid_out(valid_out_2),
	.sum_data(sum_data_2)
	);

// PE 3: weight[18..26] with fmap[215:144]
conv_pe uut_conv_pe_3(
	.clk(clk),
	.rst(rst),

	.input_en(input_fmap_en),
	.kernel_00(weight[18]),
	.kernel_01(weight[19]),
	.kernel_02(weight[20]),
	.kernel_10(weight[21]),
	.kernel_11(weight[22]),
	.kernel_12(weight[23]),
	.kernel_20(weight[24]),
	.kernel_21(weight[25]),
	.kernel_22(weight[26]),

	.fmap_00(fmap[151:144]),
	.fmap_01(fmap[159:152]),
	.fmap_02(fmap[167:160]),
	.fmap_10(fmap[175:168]),
	.fmap_11(fmap[183:176]),
	.fmap_12(fmap[191:184]),
	.fmap_20(fmap[199:192]),
	.fmap_21(fmap[207:200]),
	.fmap_22(fmap[215:208]),

	.valid_out(valid_out_3),
	.sum_data(sum_data_3)
	);

wire [31:0] sum_data_32;    // sum over the three PEs plus bias
wire signed [7:0]bias_temp; // bias of output channel 0
assign bias_temp=bias[0];
assign output_en=valid_out_1&valid_out_2&valid_out_3; // valid only when every PE is valid

// Keep only the low 8 bits of each PE result, zero-extended to 32 bits
// (each term is therefore in 0..255).
wire [31:0]sum_data_1_Q,sum_data_2_Q,sum_data_3_Q;

assign sum_data_1_Q={24'd0,sum_data_1[7:0]};
assign sum_data_2_Q={24'd0,sum_data_2[7:0]};
assign sum_data_3_Q={24'd0,sum_data_3[7:0]};

// Add the three terms and the sign-extended bias while output_en is high;
// drive zero otherwise.
assign sum_data_32= (output_en==1)?sum_data_1_Q+sum_data_2_Q+sum_data_3_Q+{{24{bias_temp[7]}},bias_temp}:32'd0;

// Only the low 8 bits are output: the result wraps, it is not saturated.
// Read as a signed byte this covers -128..127.
assign end_data=sum_data_32[7:0];

// Counts the cycles in which output_en was high. It drives no port and is
// only observable in simulation waveforms.
reg [31:0]conv_pe_count;
always @(posedge clk) begin
	if(rst) begin
		 conv_pe_count<= 0;
	end else begin
		if(output_en)begin
			conv_pe_count<=conv_pe_count+1;
		end
	end
end

endmodule

仿真需要 imageBlueChannels.txt、imageGreenChannels.txt、imageRedChannels.txt,以及 bias.txt、weight.txt 文件;请将它们放入工程文件夹下,并把下面 testbench 中 $readmemh 和 $fopen 使用的绝对路径改为这些文件在本机上的实际位置。

`timescale 1ns / 1ps

// tb_conv_layer: simulation testbench for conv_layer (3 input channels,
// 1 output channel).
//
// Sequence:
//   1. hold reset for 1000 ns;
//   2. stream weight.txt and bias.txt into the DUT, one byte per clock;
//   3. once the DUT reports write_done_weight_bias, slide a 3x3 window over a
//      482x322 three-channel image and present one window every second clock;
//   4. append end_data to conv_layer_result.txt on every clock edge where the
//      DUT's output_en is high.
//
// NOTE(review): this testbench never calls $fclose or $finish, so the run has
// to be stopped manually once `states` has reached 2 and the DUT has drained.
module tb_conv_layer;

reg clk;
reg rst;

// Active-high reset, released after 1000 ns.
initial begin
	rst = 1;
	#1000
	rst = 0;
end

// Free-running clock: toggles every 5 time units, i.e. a 10 ns period.
always begin: clk1_blk
    clk = 0;
	forever #5 clk = ~clk;
end

localparam CHANNEL_IN=3;
localparam CHANNEL_OUT=1;
reg input_weight_en,input_bias_en,input_fmap_en; // weight / bias / feature-map enables
reg [7:0] weight_ab,bias_ab;                     // byte currently presented to the DUT
reg [9*8*CHANNEL_IN-1:0]fmap_ab;                 // one 3x3 window x 8 bit x CHANNEL_IN
wire write_done_weight_bias,output_en;           // DUT: parameters loaded / output valid
wire [8*CHANNEL_OUT-1:0]end_data;                // DUT: 8-bit result, one output channel

conv_layer#(
	.CHANNEL_IN(CHANNEL_IN),
	.CHANNEL_OUT(CHANNEL_OUT)
	)uut_conv_layer(
	.clk(clk),
	.rst(rst),

	.input_weight_en(input_weight_en),
	.input_bias_en(input_bias_en),

	.weight_ab(weight_ab),
	.bias_ab(bias_ab),
	.write_done_weight_bias(write_done_weight_bias),

	.input_fmap_en(input_fmap_en),
	.fmap(fmap_ab),

	.output_en(output_en),
	.end_data(end_data)

	);

// Original author's note: in a real project the data is written into the
// device through an external interface (UART, PCIe, optical or Ethernet port),
// or the weights are kept on chip; with large data sets an external memory
// such as DDR is normally used to buffer the weights.

// ---------------------------------------------------------------------------
// Weight loader
// ---------------------------------------------------------------------------
reg [7:0] weight[9*CHANNEL_IN*CHANNEL_OUT-1:0]; // 3*3*CHANNEL_IN weights per output channel
initial begin
	// Original note on the file layout: 1-element blocks, ordered along the
	// channel direction first, then by column -- TODO confirm against the
	// script that generates the file.
	$readmemh("C://Users//mayn//Desktop//nn//conv//conv_layer//weight.txt",weight);
end

integer weight_count;
always @(posedge clk) begin
	if(rst) begin
		weight_count<= 0;
		input_weight_en<=0;
		weight_ab<=0;
	end else begin
		if(weight_count<9*CHANNEL_IN*CHANNEL_OUT)begin
			// Still weights left: present the next one with the enable high.
			input_weight_en<=1;
			weight_count<= weight_count+1;
			weight_ab<=weight[weight_count];
		end
		else begin
			// All weights sent: release the enable.
			input_weight_en<=0;
			weight_ab<=0;
		end
	end
end

// ---------------------------------------------------------------------------
// Bias loader
// ---------------------------------------------------------------------------
reg [7:0]bias[CHANNEL_OUT-1:0]; // one bias per output channel
initial begin
	$readmemh("C://Users//mayn//Desktop//nn//conv//conv_layer//bias.txt",bias);
end

integer bias_count;
always @(posedge clk) begin
	if(rst) begin
		bias_count<= 0;
		input_bias_en<=0;
		bias_ab<=0;
	end else begin
		if(bias_count<CHANNEL_OUT)begin
			input_bias_en<=1;
			bias_count<= bias_count+1;
			bias_ab<=bias[bias_count];
		end
		else begin
			input_bias_en<=0;
			bias_ab<=0;
		end
	end
end

// ---------------------------------------------------------------------------
// Feature-map feeder
// ---------------------------------------------------------------------------
localparam  IMAGE_WIDTH=482;
localparam  IMAGE_HIGH=322;
// Original author's note: feature-map data normally comes from an external
// memory. Here each colour plane is a flat array indexed as
// column + IMAGE_WIDTH*row.
reg [7:0] fmap_R[IMAGE_WIDTH*IMAGE_HIGH-1:0]; // full red plane
reg [7:0] fmap_G[IMAGE_WIDTH*IMAGE_HIGH-1:0]; // full green plane
reg [7:0] fmap_B[IMAGE_WIDTH*IMAGE_HIGH-1:0]; // full blue plane
initial begin
	$readmemh("C://Users//mayn//Desktop//nn//conv//conv_layer//imageBlueChannels.txt",fmap_B);
	$readmemh("C://Users//mayn//Desktop//nn//conv//conv_layer//imageGreenChannels.txt",fmap_G);
	$readmemh("C://Users//mayn//Desktop//nn//conv//conv_layer//imageRedChannels.txt",fmap_R);
end
integer i,j;      // i: window column, j: window row (top-left corner)
reg [2:0]states;  // 0: present a window, 1: advance position, 2: finished

always @(posedge clk ) begin
	if(rst) begin
		i<=0;
		j<=0;

		states<=0;

		input_fmap_en<=0;
		fmap_ab<=0;
	end
	else if(write_done_weight_bias) begin // start only after all parameters are loaded
		case(states)
			0:begin
				// Output width = IMAGE_WIDTH - 3 + 1 = 480, so i runs over 0..479.
				if(i<IMAGE_WIDTH-2)begin
					input_fmap_en<=1;
					// Packing, MSB first: R rows j+2, j+1, j; then G; then B.
					// Within a row the rightmost pixel (i+2) comes first, so
					// the blue pixel at (i, j) ends up in fmap_ab[7:0]. The
					// low 72 bits are therefore B, the middle 72 bits G and
					// the top 72 bits R.
					// Non-blocking like every other register in this block,
					// so the DUT never races against this update at the edge.
					fmap_ab<={fmap_R[i+2+IMAGE_WIDTH*(j+2)],fmap_R[i+1+IMAGE_WIDTH*(j+2)],fmap_R[i+0+IMAGE_WIDTH*(j+2)],// i=0,j=0: R row 2, indices 966, 965, 964
					          fmap_R[i+2+IMAGE_WIDTH*(j+1)],fmap_R[i+1+IMAGE_WIDTH*(j+1)],fmap_R[i+0+IMAGE_WIDTH*(j+1)],// i=0,j=0: R row 1, indices 484, 483, 482
					          fmap_R[i+2+IMAGE_WIDTH*(j+0)],fmap_R[i+1+IMAGE_WIDTH*(j+0)],fmap_R[i+0+IMAGE_WIDTH*(j+0)],// i=0,j=0: R row 0, indices 2, 1, 0
					          fmap_G[i+2+IMAGE_WIDTH*(j+2)],fmap_G[i+1+IMAGE_WIDTH*(j+2)],fmap_G[i+0+IMAGE_WIDTH*(j+2)],
					          fmap_G[i+2+IMAGE_WIDTH*(j+1)],fmap_G[i+1+IMAGE_WIDTH*(j+1)],fmap_G[i+0+IMAGE_WIDTH*(j+1)],
					          fmap_G[i+2+IMAGE_WIDTH*(j+0)],fmap_G[i+1+IMAGE_WIDTH*(j+0)],fmap_G[i+0+IMAGE_WIDTH*(j+0)],
					          fmap_B[i+2+IMAGE_WIDTH*(j+2)],fmap_B[i+1+IMAGE_WIDTH*(j+2)],fmap_B[i+0+IMAGE_WIDTH*(j+2)],
					          fmap_B[i+2+IMAGE_WIDTH*(j+1)],fmap_B[i+1+IMAGE_WIDTH*(j+1)],fmap_B[i+0+IMAGE_WIDTH*(j+1)],
					          fmap_B[i+2+IMAGE_WIDTH*(j+0)],fmap_B[i+1+IMAGE_WIDTH*(j+0)],fmap_B[i+0+IMAGE_WIDTH*(j+0)]};
					i<=i+1;    // slide the window one column to the right
					states<=1;
				end
				else begin
					input_fmap_en<=0;
				end
			end // 0:
			1:begin
				input_fmap_en<=0; // the enable is a one-cycle pulse per window
				if(i==IMAGE_WIDTH-2)begin
					// End of a row of windows: back to column 0, one row down.
					i<=0;
					j<=j+1;
				end
				// Output height = IMAGE_HIGH - 3 + 1 = 320, so j runs over 0..319.
				if(j<IMAGE_HIGH-2)begin
					states<=0;
				end
				// Last column of the last row: this later assignment overrides
				// the states<=0 above.
				if((j==IMAGE_HIGH-3)&&(i==IMAGE_WIDTH-2))begin
					states<=2;
				end
			end // 1:
			2:begin
				// Whole image processed: keep the feature-map inputs idle.
				input_fmap_en<=0;
				fmap_ab<=0;
			end // 2:
		endcase // states
	end
end

// ---------------------------------------------------------------------------
// Result capture
// ---------------------------------------------------------------------------
integer end_temp; // file descriptor of the result file
initial begin
	end_temp=$fopen("C://Users//mayn//Desktop//nn//conv//conv_layer//conv_layer_result.txt","w");
end
always @(posedge clk) begin
	// One hex value per line for every clock edge with output_en high.
	if(uut_conv_layer.output_en)begin
		$fwrite(end_temp,"%h\n",$signed(uut_conv_layer.end_data));
	end
end
endmodule