Verilator 5でそれなりの規模のSystemVerilogテストベンチをシミュレーション(したかった)
Verilator5からはタイミング記述がサポート、すなわちテストベンチをHDLで書けるようになったらしい。
つまりこれまでxsimで使ってたベンチがそのまま使えるのか!?ということでやってみた。
結果としては動かなかったが、Verilator5の情報はまだ少ないので記事として公開する。
環境
- Verilator 5.010 2023-04-30 rev v5.010-33-gc37ec1975
-
g++ (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0unrecognized optionエラーが出たのでg++11にしました - Ubuntu 18.04 on Windows11 WSL2
C++20の使用がstrongly recommendedされているがgcc 7はC++17。まあいけるっしょ
いけなかった
乗算器シミュレーション
使ったコード
`timescale 1ns / 1ps
//////////////////////////////////////////////////////////////////////////////////
// Company: YNU
// Engineer: Junichi Sakamoto
//
// Create Date: 08/16/2022 10:49:29 AM
// Design Name:
// Module Name: params
// Project Name:
// Target Devices:
// Tool Versions:
// Description:
//
// Dependencies:
//
// Revision:
// Revision 0.01 - File Created
// Additional Comments:
//
//////////////////////////////////////////////////////////////////////////////////
`define BLS12_381
`ifdef BLS12_381
//`define PARAMS_BN254_d0
package PARAMS_BN254_d0;
localparam
Mod = 384'h1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab, //The BLS12381 prime
M_tilde = 397'h1a00c3e703c13a1a974cc6634c28e2f45df11e1f10f638302ef29d0e7ece12113c3fa3fab157ec03b9ffd202ffffaaabffff; // 397 bits
localparam
K = 17,
L = 26,
C = L - K,
D = 0,
_Mpp = 384'hd0061f381e09d0d4ba66331a614717a2ef88f0f887b1c1817794e873f6709089e1fd1fd58abf601dcffe9017fffd556, //k=17
R = 425 - 1,
RmodM = 'h6266ea86a2d27c9ffae7bca245ef0aec53439f2badb6a0480fa012749c8bd9dde2ecd0389ebc8f9c4291d51d3fbb2cd,
R2modM = 'h11e41d6827510424302748620c16c6767000b6609ab32f0bbce50a9fa478fa904fb1a226d7c8c3b38ac02e272c3d3bb1,
R_INV = 'h5d0b6a95f0f140c9be94886e201ff4e1d3e75366d5014ac26f9e99e3ac1a0d2797922988d180e68f014e4fe540686ac, //r=425
N = 25, // R/K + 1, at K = 17
M = 16, // R/L +1
HALF_S = (M+D+1)/2, // must be even
// S_1_3 = (M+D+1)/3, // 1/3
S_1_3 = 6,
//S_1_4 = (M+D-1)/4; // 1/4
S_1_4 = 4,//(M+D)/4,
ADD_DIV = 4,
L3_CARRY = 8,
LEN_12M_TILDE = 404,
LEN_1024M_TILDE = 407, // Must be divided by ADD_DIV
N_THREADS = 6,
BRAM_DEPTH = 10,
LAT_PE = 3,
LAT_QPMM = (N+1) * LAT_PE + 4;
typedef logic[$bits(M_tilde):0] uint_Mtilde2_t;
typedef logic[M:0][47:0] qpmm_S_t;
typedef logic[M-2:0][L-1:0] poly_Mpp_t;
typedef logic[K*N-1:0] uint_fp_t;
typedef logic[L*M-1:0] uint_fpa_t;
typedef logic[M-1:0][L-1:0] poly_a_t; //288
typedef logic[N-1:0][K-1:0] poly_b_t; //272
typedef logic[(HALF_S-1)*L+48-1:0] qpmm_S_half;
typedef logic[(S_1_3-1)*L+48-1:0] qpmm_S_1_3;
typedef logic[(S_1_4-1)*L+48-1:0] qpmm_S_1_4;
typedef logic[LEN_12M_TILDE/ADD_DIV-1:0] fp_div4_t; // uint divided by 4. Lack of 1 bit for 289
// Struct
typedef struct packed {
logic carry;
fp_div4_t val;
} redundant_term_L1;
typedef redundant_term_L1[ADD_DIV-1:0] redundant_poly_L1;
typedef struct packed {
logic [1:0] carry;
fp_div4_t val;
} redundant_term_L2;
typedef redundant_term_L2[ADD_DIV-1:0] redundant_poly_L2;
typedef struct packed {
logic [L3_CARRY-1:0] carry;
fp_div4_t val;
} redundant_term_L3;
typedef redundant_term_L3[ADD_DIV-1:0] redundant_poly_L3;
// Union
typedef union packed {
logic[LEN_12M_TILDE-1:0] uint; // 272
fp_div4_t[ADD_DIV-1:0] poly;
} M_tilde12_t;
typedef union packed {
uint_fp_t uint;
poly_b_t poly_b;
} qpmm_fp_t;
typedef union packed {
uint_fpa_t uint;
poly_a_t poly_a;
} qpmm_fpa_t;
typedef union packed {
uint_fp_t uint;
poly_b_t poly_b;
} qpmm_fpb_t;
function qpmm_fp_t rand_288();
// N must be less than 320 bits
rand_288 = {$urandom(), $urandom(), $urandom(), $urandom(), $urandom(), $urandom(), $urandom(), $urandom(), $urandom(), $urandom(), $urandom(), $urandom(), $urandom()};
endfunction
endpackage
`else
//`define PARAMS_BN254_d0
package PARAMS_BN254_d0;
localparam
Mod = 256'h2523648240000001ba344d80000000086121000000000013a700000000000013, //The BN254 prime
M_tilde = 267'h7d18c77dfc340005d1864d4d3800001c39ab785000000042327630000000003ffff; // 267 bits
// localparam
// K = 16,
// L = 24,
// C = L - K,
// D = 0,
// _Mpp = 256'h7d18c77dfc340005d1864d4d3800001c39ab785000000042327630000000004, //k=16
// R = 272 - 1,
// R_INV = 'h131822b9f3de491ff4d85504410ed56c72e68c1f514017577c489d762ae9cf77, //r=272
// N = R/K + 1, // 17 at k=16
// M = R/L + 1, //12 at L=24
// HALF_S = (M+D+1)/2, // must be even
// S_1_3 = (M+D+1)/3, // 1/3
// //S_1_4 = (M+D-1)/4; // 1/4
// S_1_4 = (M+D)/4,
// ADD_DIV = 4,
// L3_CARRY = 8; // 1/4
localparam
K = 17,
L = 26,
C = L - K,
D = 0,
_Mpp = 256'h3e8c63befe1a0002e8c326a69c00000e1cd5bc2800000021193b18000000002, //k=17
R = 289 - 1,
RmodM = 'h5b61645efa0be833e0cf20c7a8e86587e5efef111005428d8fffefa0f51466d,
R2modM = 'h1e3ad4f19ece02905cd917dec0178837a70990ae5b87678a825bfd79f8a881b8,
R_INV = 'h4183ffd639e59ef555105a592c220885737a65cc60fa8a23659c0a44ebb1577, //r=289
N = R/K + 1, //17
M = 11,
HALF_S = (M+D+1)/2, // must be even
S_1_3 = (M+D+1)/3, // 1/3
//S_1_4 = (M+D-1)/4; // 1/4
S_1_4 = 3,//(M+D)/4,
ADD_DIV = 4,
L3_CARRY = 8,
LEN_12M_TILDE = 272,
LEN_1024M_TILDE = 277, // Must be divided by ADD_DIV
N_THREADS = 5,
BRAM_DEPTH = 10, // N_THREADS > 4, then 10
LAT_PE = 4,
LAT_QPMM = (N+1) * LAT_PE + 4;
typedef logic[$bits(M_tilde):0] uint_Mtilde2_t;
typedef logic[M:0][47:0] qpmm_S_t;
typedef logic[M-2:0][L-1:0] poly_Mpp_t;
typedef logic[K*N-1:0] uint_fp_t;
typedef logic[L*M-1:0] uint_fpa_t;
typedef logic[M-1:0][L-1:0] poly_a_t; //288
typedef logic[N-1:0][K-1:0] poly_b_t; //272
typedef logic[(HALF_S-1)*L+48-1:0] qpmm_S_half;
typedef logic[(S_1_3-1)*L+48-1:0] qpmm_S_1_3;
typedef logic[(S_1_4-1)*L+48-1:0] qpmm_S_1_4;
typedef logic[LEN_12M_TILDE/ADD_DIV-1:0] fp_div4_t; // uint divided by 4. Lack of 1 bit for 289
// Struct
typedef struct packed {
logic carry;
fp_div4_t val;
} redundant_term_L1;
typedef redundant_term_L1[ADD_DIV-1:0] redundant_poly_L1;
typedef struct packed {
logic [1:0] carry;
fp_div4_t val;
} redundant_term_L2;
typedef redundant_term_L2[ADD_DIV-1:0] redundant_poly_L2;
typedef struct packed {
logic [L3_CARRY-1:0] carry;
fp_div4_t val;
} redundant_term_L3;
typedef redundant_term_L3[ADD_DIV-1:0] redundant_poly_L3;
// Union
typedef union packed {
logic[LEN_12M_TILDE-1:0] uint; // 272
fp_div4_t[ADD_DIV-1:0] poly;
} M_tilde12_t;
typedef union packed {
uint_fp_t uint;
poly_b_t poly_b;
} qpmm_fp_t;
typedef union packed {
uint_fpa_t uint;
poly_a_t poly_a;
} qpmm_fpa_t;
typedef union packed {
uint_fp_t uint;
poly_b_t poly_b;
} qpmm_fpb_t;
function qpmm_fp_t rand_288();
// N must be less than 320 bits
rand_288 = {$urandom(), $urandom(), $urandom(), $urandom(), $urandom(), $urandom(), $urandom(), $urandom(), $urandom()};
endfunction
endpackage
`endif
`ifndef CONTROL
`define CONTROL
package CONTROL;
import PARAMS_BN254_d0::BRAM_DEPTH;
// Struct
typedef struct packed {
logic os;
logic inve;
logic [1:0] poa3;
logic [1:0] poa2;
logic [1:0] pos;
logic [2:0] pom3;
logic [2:0] pom2;
logic [2:0] pom1;
logic [2:0] cm; // Control instruction
logic [1:0] pm2;
logic [1:0] pm1;
logic me1;
logic me0;
logic opc;
logic [7:0] waddr0;
logic [7:0] waddr1;
logic [7:0] raddr0;
logic [7:0] raddr1;
} raw_rom_t;
typedef struct packed {
logic inve;
logic [1:0] pos;
logic [2:0] pom3;
logic [2:0] pom2;
logic [2:0] pom1;
logic [2:0] cm;
logic [1:0] pm;
logic me1;
logic me0;
} ctrl_sig_t;
typedef struct packed {
logic inve;
logic [1:0] pos;
logic [2:0] pom3;
logic [2:0] pom2;
logic [2:0] pom1;
logic [2:0] cm;
logic [1:0] pm;
logic offset_dst;
logic offset_src0;
logic offset_src1;
} ctrl_sig_offset_t; // without cmul
typedef struct packed {
ctrl_sig_t csig;
logic [BRAM_DEPTH-1:0] dst;
logic [BRAM_DEPTH-1:0] src0;
logic [BRAM_DEPTH-1:0] src1;
} micro_ops_t;
typedef struct packed {
logic [1:0] field;
logic t;
logic [3:0] arith;
} operation_t; // 7 bits
typedef struct packed {
logic me;
logic [2:0] cm;
logic [3:0] end_mop_cnt;
} sub_operation_t; // 8 bits
typedef struct packed {
operation_t op;
sub_operation_t sub_op;
} opcode_t; // 15 bits
typedef struct packed {
opcode_t opcode;
logic [6:0] dst;
logic [6:0] src0;
logic [6:0] src1;
} instruction_t;
endpackage
`endif
`ifndef PARAM_UART
`define PARAM_UART
package PARAM_UART;
parameter UART_CLK_FREQ = 100000000,
UART_BAUD_RATE = 460800;
endpackage
`endif
`timescale 1ns / 1ps
//////////////////////////////////////////////////////////////////////////////////
// Company: YNU
// Engineer: Junichi Sakamoto
//
// Create Date: 09/06/2022 09:51:27 AM
// Design Name:
// Module Name: DSP_mul
// Project Name:
// Target Devices:
// Tool Versions:
// Description:
//
// Dependencies:
//
// Revision:
// Revision 0.01 - File Created
// Additional Comments:
//
//////////////////////////////////////////////////////////////////////////////////
import PARAMS_BN254_d0::*;
module DSP_mul
#(
parameter latency = 0
)(
input clk,
input [L-1:0] in_a,
input [K-1:0] in_b,
output [47:0] out_s
);
`ifdef XILINX_SIMULATOR
logic [26:0] reg_a, reg_m;
logic [17:0] reg_b, reg_q;
logic [47:0] reg_p, reg_pp;
assign out_s = reg_pp;
always @(posedge clk) begin
reg_a <= 27'(in_a);
reg_b <= 18'(in_b);
reg_p <= reg_a * reg_b;
reg_pp <= reg_p;
end
`else
xbip_dsp48_macro_ab_3 mul_ab (
.CLK(clk), // input wire CLK
.A(27'(in_a)), // input wire [16 : 0] A
.B(18'(in_b)), // input wire [16 : 0] B
.PCOUT(), // output wire [47 : 0] PCOUT
.P(out_s) // output wire [33 : 0] P
);
`endif
endmodule
module PE
#(
parameter latency = 0
)(
input clk,
input [K-1:0] in_b, in_q,
input [L-1:0] in_a, in_m,
input [2*K-1:0] in_sl,
input [48-2*K-1:0] in_su,
output [47:0] out_s
);
wire [47:0] w_pc;
`ifdef XILINX_SIMULATOR
logic [26:0] reg_a, reg_m;
logic [17:0] reg_b, reg_q;
logic [47:0] reg_su, reg_sl, reg_sl_, reg_pcout, reg_prod, reg_p;
assign out_s = reg_p;
always @(posedge clk) begin
reg_a <= 27'(in_a);
reg_b <= 18'(in_b);
reg_su <= 48'(in_su << K);
reg_pcout <= reg_a * reg_b + reg_su;
reg_m <= 27'(in_m);
reg_q <= 18'(in_q);
reg_sl_ <= 48'(in_sl << C);
reg_sl <= reg_sl_;
reg_prod <= reg_m * reg_q;
reg_p <= reg_prod + reg_sl + reg_pcout;
end
`else
xbip_dsp48_macro_ab_c_2 mul_ab (
.CLK(clk), // input wire CLK
.A(27'(in_a)), // input wire [26 : 0] A
.B(18'(in_b)), // input wire [17 : 0] B
.C(48'(in_su << K)), // input wire [47 : 0] C
.PCOUT(w_pc), // output wire [47 : 0] PCOUT
.P() // output wire [47 : 0] P
);
xbip_dsp48_macro_ab_c_pcin_3 mul_qm (
.CLK(clk), // input wire CLK
.PCIN(w_pc), // input wire [47 : 0] PCIN
.A(27'(in_m)), // input wire [26 : 0] A
.B(18'(in_q)), // input wire [17 : 0] B
.C(48'(in_sl << C)), // input wire [47 : 0] C
.P(out_s) // output wire [47 : 0] P
);
`endif
endmodule
`timescale 1ns / 1ps
//////////////////////////////////////////////////////////////////////////////////
// Company: YNU
// Engineer: Junichi Sakamoto
//
// Create Date: 09/5/2022 10:40:18 AM
// Design Name:
// Module Name: QPMM_d0
// Project Name:
// Target Devices:
// Tool Versions:
// Description:
//
// Dependencies:
//
// Revision:
// Revision 0.01 - File Created
// Additional Comments:
//
//////////////////////////////////////////////////////////////////////////////////
import PARAMS_BN254_d0::*;
localparam latency_FA = 4;
module QPMM_d0(
input clk, rstn,
input [LEN_1024M_TILDE-1:0] A,
input [LEN_1024M_TILDE-1:0] B,
output uint_Mtilde2_t Z
);
///////////////////////////////////////////////
// Declaration
//////////////////////////////////////////////
qpmm_fpa_t[N+D-1:0] reg_A, buf_A, buf_A2, buf_A3;
qpmm_fpb_t[N+D-1:0] reg_B, buf_B, buf_B2, buf_B3;
qpmm_fpa_t Mpp = _Mpp;
logic[N+D:0][K-1:0] wire_q;
qpmm_S_t[N+D+1:0] reg_S;
qpmm_S_t buf_S;
///////////////////////////////////////////////
// Initilization
//////////////////////////////////////////////
// For 272>
// assign reg_A[0][271:0] = A[271:0];
// assign reg_A[0][285:272] = '0;
assign reg_A[0] = ($bits(qpmm_fpa_t))'(A);
assign reg_B[0] = ($bits(qpmm_fpb_t))'(B);
assign reg_S[0] = {default: '0};
assign wire_q[0] = '0;
///////////////////////////////////////////////
// Final Addition
//////////////////////////////////////////////
if (latency_FA == 2) begin
qpmm_S_half Z_L, Z_H;
always @(posedge clk) begin
buf_S <= reg_S[N+D+1];
Z_L <= poly2int_half(buf_S[HALF_S-1:1]) + buf_S[0][47:K];
Z_H <= poly2int_half(buf_S[M+D:HALF_S]);
Z <= (Z_H << (HALF_S-1)*L) + Z_L;
end
end
else if (latency_FA == 3) begin
qpmm_S_1_3 Z_L, Z_M, Z_H, Z_H2;
logic [(S_1_3*2-1)*L+48-1:0] Z_ML;
always @(posedge clk) begin
buf_S <= reg_S[N+D+1];
Z_L <= poly2int_1_3(buf_S[S_1_3-1:1]) + buf_S[0][47:K];
Z_M <= poly2int_1_3(buf_S[(2*S_1_3)-1:S_1_3]);
Z_H <= poly2int_half(buf_S[M+D:(2*S_1_3)]);
Z_ML <= (Z_M << (S_1_3-1)*L) + Z_L;
Z_H2 <= Z_H;
Z <= (Z_H2 << (2*S_1_3-1)*L) + Z_ML;
end
end
else if (latency_FA == 4) begin
qpmm_S_1_4 Z_LL, Z_LH, Z_HL, Z_HH;
qpmm_S_half Z_L, Z_H;
always @(posedge clk) begin
buf_S <= reg_S[N+D+1];
Z_LL <= poly2int_1_4(buf_S[S_1_4:1]) + buf_S[0][47:K];
Z_LH <= poly2int_1_4(buf_S[2*S_1_4:S_1_4+1]);
Z_HL <= poly2int_1_4(buf_S[3*S_1_4:2*S_1_4+1]);
`ifdef BLS12_381
Z_HH <= poly2int_1_4(buf_S[4*S_1_4:3*S_1_4+1]);
`else
if(K==16)
Z_HH <= poly2int_1_4(buf_S[4*S_1_4:3*S_1_4+1]);
else if(K==17)
Z_HH <= poly2int_1_4({40'd0, buf_S[3*S_1_4+2:3*S_1_4+1]});
`endif
Z_L <= (Z_LH << (S_1_4*L)) + Z_LL;
Z_H <= (Z_HH << (S_1_4*L)) + Z_HL;
Z <= (Z_H << (2*S_1_4*L)) + Z_L;
end
end
///////////////////////////////////////////////
// Body
//////////////////////////////////////////////
for(genvar i = 0; i < N + D + 1; i = i + 1) begin : QPMM
for(genvar j = 0; j < M + D; j = j + 1) begin : for_1
if(i == 0)
DSP_mul #(.latency(LAT_PE)) mul_ab (.clk(clk), .in_a(reg_A[i].poly_a[j]), .in_b(reg_B[i].poly_b[i]), .out_s(reg_S[i+1][j+1]));
else if(i == N + D) begin
if(j == M - 1)
assign reg_S[i+1][j+1] = '0;
else
PE #(.latency(LAT_PE)) pe(.clk(clk), .in_a('0), .in_b('0), .in_m(Mpp.poly_a[j]), .in_q(wire_q[i]), .in_sl(reg_S[i][j+2][2*K-1:0]), .in_su(reg_S[i][j+1][47:2*K]), .out_s(reg_S[i+1][j+1]));
end
else begin
if(j == M - 1)
DSP_mul #(.latency(LAT_PE)) mul_ab (.clk(clk), .in_a(reg_A[i].poly_a[j]), .in_b(reg_B[i].poly_b[i]), .out_s(reg_S[i+1][j+1]));
else
PE #(.latency(LAT_PE)) pe(.clk(clk), .in_a(reg_A[i].poly_a[j]), .in_b(reg_B[i].poly_b[i]), .in_m(Mpp.poly_a[j]), .in_q(wire_q[i]), .in_sl(reg_S[i][j+2][2*K-1:0]), .in_su(reg_S[i][j+1][47:2*K]), .out_s(reg_S[i+1][j+1]));
end
end
wire [47:0] buf_q = reg_S[i][0][47:K] + reg_S[i][1][2*K-1:0];
assign wire_q[i] = buf_q[K-1:0];
if (LAT_PE == 1) begin
always_ff @(posedge clk) begin : ff_ab_q
if(i < N + D) begin
reg_A[i+1] <= reg_A[i];
reg_B[i+1] <= reg_B[i];
end
reg_S[i+1][0] <= buf_q;
end
end
else if (LAT_PE == 2) begin
logic [47:0] buf_s0;
always_ff @(posedge clk) begin : ff_ab_q
if(i < N + D) begin
buf_A[i] <= reg_A[i];
buf_B[i] <= reg_B[i];
reg_A[i+1] <= buf_A[i];
reg_B[i+1] <= buf_B[i];
end
buf_s0 <= buf_q;
reg_S[i+1][0] <= buf_s0;
end
end
else if (LAT_PE == 3) begin
logic [47:0] buf_s0_0, buf_s0_1;
always_ff @(posedge clk) begin : ff_ab_q
if(i < N + D) begin
buf_A[i] <= reg_A[i];
buf_B[i] <= reg_B[i];
buf_A2[i] <= buf_A[i];
buf_B2[i] <= buf_B[i];
reg_A[i+1] <= buf_A2[i];
reg_B[i+1] <= buf_B2[i];
end
buf_s0_0 <= buf_q;
buf_s0_1 <= buf_s0_0;
reg_S[i+1][0] <= buf_s0_1;
end
end
else if (LAT_PE == 4) begin
logic [47:0] buf_s0_0, buf_s0_1, buf_s0_2;
always_ff @(posedge clk) begin : ff_ab_q
if(i < N + D) begin
buf_A[i] <= reg_A[i];
buf_B[i] <= reg_B[i];
buf_A2[i] <= buf_A[i];
buf_B2[i] <= buf_B[i];
buf_A3[i] <= buf_A2[i];
buf_B3[i] <= buf_B2[i];
reg_A[i+1] <= buf_A3[i];
reg_B[i+1] <= buf_B3[i];
end
buf_s0_0 <= buf_q;
buf_s0_1 <= buf_s0_0;
buf_s0_2 <= buf_s0_1;
reg_S[i+1][0] <= buf_s0_2;
end
end
end
function qpmm_S_half poly2int_1_4;
input [S_1_4-1:0][47:0] A;
poly2int_1_4 = 0;
for(integer i = 0; i < S_1_4; i = i + 1) begin
poly2int_1_4 = poly2int_1_4 + (A[i] << (L*i));
end
endfunction
function qpmm_S_half poly2int_1_3;
input [S_1_3-1:0][47:0] A;
poly2int_1_3 = 0;
for(integer i = 0; i < S_1_3; i = i + 1) begin
poly2int_1_3 = poly2int_1_3 + (A[i] << (L*i));
end
endfunction
function qpmm_S_half poly2int_half;
input [HALF_S-1:0][47:0] A;
poly2int_half = 0;
for(integer i = 0; i < HALF_S; i = i + 1) begin
poly2int_half = poly2int_half + (A[i] << (L*i));
end
endfunction
function qpmm_fpa_t poly2int;
input qpmm_S_t A;
poly2int = 0;
for(integer i = 0; i <= M+D+1; i = i + 1) begin
poly2int = poly2int + (A[i] << (L*i));
end
endfunction
endmodule
`timescale 1ns / 1ps
//////////////////////////////////////////////////////////////////////////////////
// Company: YNU
// Engineer: Junichi Sakamoto
//
// Create Date: 08/16/2022 11:50:10 AM
// Design Name:
// Module Name: test_QPMM
// Project Name:
// Target Devices:
// Tool Versions:
// Description:
//
// Dependencies:
//
// Revision:
// Revision 0.01 - File Created
// Additional Comments:
//
//////////////////////////////////////////////////////////////////////////////////
import PARAMS_BN254_d0::*;
`ifdef BLS12_381
localparam bit_width = 381;
localparam N_PIPELINE_STAGES = 82;
//localparam N_PIPELINE_STAGES = 108;
`else
localparam bit_width = 254 - 3; // Why -3?
//localparam N_PIPELINE_STAGES = 22;
//localparam N_PIPELINE_STAGES = 40;
//localparam N_PIPELINE_STAGES = 58;
localparam N_PIPELINE_STAGES = 76;
`endif
module test_QPMM;
localparam
CYCLE = 10,
DELAY = 2,
N_DATA = 1000000;
reg clk, rstn;
qpmm_fp_t A, B;
wire [bit_width+16:0] Z;
wire [999:0] tmp_ans = MR(A) * MR(B);
wire [bit_width-1:0] ans = tmp_ans % PARAMS_BN254_d0::Mod;
wire [bit_width-1:0] res = MR(Z);
reg [N_PIPELINE_STAGES:0][bit_width-1:0] reg_ans;
QPMM_d0 DUT(.clk, .rstn, .A, .B, .Z);
always begin
#(CYCLE/2) clk <= ~clk;
end
for(genvar i = 0; i < N_PIPELINE_STAGES; i++) begin
always @(posedge clk) begin
reg_ans[i+1] <= reg_ans[i];
end
end
/*-------------------------------------------
Test
-------------------------------------------*/
initial begin
rstn <= 1'b1;
clk <= 1'b1;
#(CYCLE*10)
rstn <= 1'b0;
#(CYCLE*10)
rstn <= 1'b1;
#DELAY;
$display("Test QPMM start\n");
for(integer i = 0; i < N_PIPELINE_STAGES; i = i + 1) begin
A <= rand_288() % (1024*PARAMS_BN254_d0::M_tilde);
B <= rand_288() % (1024*PARAMS_BN254_d0::M_tilde);
// A <= 272'h9d289143693cedb99a3634824056732c6ab659f650c97a3a07de4a3de195611861a;
// B <= 272'hcfc83b144a374cedb4ebb176cd5f622edee5ea20fbe8dcd1b6cec23632499a3cc0a1;
#DELAY
reg_ans[0] <= ans;
#(CYCLE-DELAY);
end
for(integer i = N_PIPELINE_STAGES; i < N_DATA; i = i + 1) begin
A <= rand_288() % (1024*PARAMS_BN254_d0::M_tilde);
B <= rand_288() % (1024*PARAMS_BN254_d0::M_tilde);
#DELAY
reg_ans[0] <= ans;
//$display("%d: x = %h, y = %h", i, A, B);
if(res !== reg_ans[N_PIPELINE_STAGES]) begin
$display("#%d Failed: ans = %h, res = %h", i, reg_ans[N_PIPELINE_STAGES], res);
//$display("x = %h, y = %h", x, y);
$stop();
end
#(CYCLE-DELAY);
end
$display("#%d Test end\n", N_DATA);
$finish;
end
function [bit_width-1:0] MR;
input qpmm_fp_t A;
logic [999:0] tmp_ans = (PARAMS_BN254_d0::R_INV*A);
MR = tmp_ans % PARAMS_BN254_d0::Mod;
endfunction
endmodule
検証コードは前作った大規模な剰余乗算器で、ペアリング用の素体乗算でかなり周波数出るようにチューニングしてある。
import
, packed union
, packed struct
などが含まれていたり、プリプロディレクティブを使ったりしているが、この辺は大丈夫なのか気になる。
あとxilinx IPを使ってる部分があるので、そこはHDL記述に直す。
このあたり
指定するファイルはトップだけでいいらしいので、まず
verilator --binary src/test_QPMM.sv
を実行
すると次のようなエラー
%Error-PKGNODECL: src/test_QPMM.sv:22:8: Package/class 'PARAMS_BN254_d0' not found, and needs to be predeclared (IEEE 1800-2017 26.3)
22 | import PARAMS_BN254_d0::*;
| ^~~~~~~~~~~~~~~
... For error description see https://verilator.org/warn/PKGNODECL?v=5.011
%Error: src/test_QPMM.sv:22:8: Importing from missing package 'PARAMS_BN254_d0'
22 | import PARAMS_BN254_d0::*;
| ^~~~~~~~~~~~~~~
インポートするファイルが見つからない的なことを言われているので、インポートファイルを明示してみる。
verilator --binary src/params.sv src/test_QPMM.sv
するとうまくいったのか、エラーが次のように変わる。
%Error: src/params.sv:37:17: Too many digits for 32 bit number: 'h6266ea86a2d27c9ffae7bca245ef0aec53439f2badb6a0480fa012749c8bd9dde2ecd0389ebc8f9c4291d51d3fbb2cd
37 | RmodM = 'h6266ea86a2d27c9ffae7bca245ef0aec53439f2badb6a0480fa012749c8bd9dde2ecd0389ebc8f9c4291d51d3fbb2cd,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
パラメータのビット幅省略がダメっぽいので、指定してやる。
すると
%Error: src/test_QPMM.sv:51:5: Cannot find file containing module: 'QPMM_d0'
51 | QPMM_d0 DUT(.clk, .rstn, .A, .B, .Z);
| ^~~~~~~
モジュールが見つからんといわれる。カレントディレクトリしか探してくれないんだろうな。ということでsrcフォルダに入ってやってみる。
%Error: QPMM_d0.sv:53:24: syntax error, unexpected default, expecting TYPE-IDENTIFIER
53 | assign reg_S[0] = {default: '0};
| ^~~~~~~
test_QPMM.sv:53:1: ... note: In file included from test_QPMM.sv
%Error: Exiting due to 1 error(s)
全ての構造体に同じ値を設定するdefault
は使えない?
とりあえず全部0にすればいいので`0
にしてみる。
すると次は
%Error: DSP_mul.sv:144:13: Cannot find file containing module: 'xbip_dsp48_macro_ab_c_2'
144 | xbip_dsp48_macro_ab_c_2 mul_ab (
| ^~~~~~~~~~~~~~~~~~~~~~~
こんな感じでIPはやっぱりだめみたい。ifdef
で消されると期待したけどそうではなかった。
というわけでip記述全部消す。
%Error: Internal Error: test_QPMM.sv:106:57: ../V3Scope.cpp:80: Can't locate varref scope
106 | logic [999:0] tmp_ans = (PARAMS_BN254_d0::R_INV*A);
これはたぶんlogicには宣言と代入が同時にできないというやつだろう。functionの中ではなぜか許されてたので使っていたが
logic [999:0] tmp_ans;
assign tmp_ans = (PARAMS_BN254_d0::R_INV*A);
このように直した。
まだまだありそう
%Error-BLKANDNBLK: QPMM_d0.sv:37:25: Unsupported: Blocked and non-blocking assignments to same variable: 'test_QPMM.DUT.reg_B'
37 | qpmm_fpb_t[N+D-1:0] reg_B, buf_B, buf_B2, buf_B3;
| ^~~~~
test_QPMM.sv:53:1: ... note: In file included from test_QPMM.sv
QPMM_d0.sv:29:33: ... Location of blocking assignment
29 | input [LEN_1024M_TILDE-1:0] B,
| ^
QPMM_d0.sv:156:32: ... Location of nonblocking assignment
156 | reg_B[i+1] <= buf_B2[i];
| ^~
ブロッキングとノンブロッキング代入を同じ変数に使えないらしい。エラーでは29行目でブロッキングしていることになってるが実際には52行目でassign reg_B[0] = ($bits(qpmm_fpb_t))'(B);
をやってるのでこれのことだろう。
これはassignなのでブロッキング代入してるわけではなくて、reg_B[0]をwireとして使ってるということで、for文で0番目の要素を特別扱いしなくてよくなるので使ってたんだけど、コーディングスタイルが悪いかな・・・
always_combの中で<=演算子を使えばいいかと思ったけど、<=がブロッキング代入として認識されてうまくいかなかった。
しかたないのでひとつずつ見て行ってノンブロッキング/ブロッキングの変換を行った。これは結構手間だし、変換前後で合ってるかどうかわからないので、かなりストレス。
ここまで来てエラーは消えたが、53個ワーニングが残っているせいか実行形式が作成されない。
ひとまず-Wno-lint -Wno-fatal
で無視しよう。すると
g++: error: unrecognized command line option ‘-fcoroutines’; did you mean ‘-fc-prototypes’?
エラーが出る。g++7/8/9で試したがだめだった。最終的にg++-11でコンパイルできることを確認
気になる実行結果は
Test QPMM start
*** stack smashing detected ***: <unknown> terminated
Aborted
まあそんなもん。$display
の文字列が表示されてるので動いてはいるので前進かな・・・
とりあえず、今までのsvが簡単に動くという感じではないということはわかった。
Discussion