🌊

Verilator 5でそれなりの規模のSystemVerilogテストベンチをシミュレーション(したかった)

2023/05/24に公開

Verilator5からはタイミング記述がサポート、すなわちテストベンチをHDLで書けるようになったらしい。
つまりこれまでxsimで使ってたベンチがそのまま使えるのか!?ということでやってみた。
結果としては動かなかったが、Verilator5の情報はまだ少ないので記事として公開する。

環境

  • Verilator 5.010 2023-04-30 rev v5.010-33-gc37ec1975
  • g++ (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0 unrecognized optionエラーが出たのでg++11にしました
  • Ubuntu 18.04 on Windows11 WSL2

C++20の使用がstrongly recommendedされているがgcc 7はC++17。まあいけるっしょ
いけなかった

乗算器シミュレーション

使ったコード
params.sv
`timescale 1ns / 1ps
//////////////////////////////////////////////////////////////////////////////////
// Company: YNU
// Engineer: Junichi Sakamoto
// 
// Create Date: 08/16/2022 10:49:29 AM
// Design Name: 
// Module Name: params
// Project Name: 
// Target Devices: 
// Tool Versions: 
// Description: 
// 
// Dependencies: 
// 
// Revision:
// Revision 0.01 - File Created
// Additional Comments:
// 
//////////////////////////////////////////////////////////////////////////////////

`define BLS12_381

`ifdef BLS12_381
//`define PARAMS_BN254_d0
package PARAMS_BN254_d0;
    localparam
        Mod = 384'h1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab, //The BLS12381 prime
        M_tilde = 397'h1a00c3e703c13a1a974cc6634c28e2f45df11e1f10f638302ef29d0e7ece12113c3fa3fab157ec03b9ffd202ffffaaabffff; // 397 bits
    localparam
        K = 17,
        L = 26,
        C = L - K,
        D = 0,
         _Mpp = 384'hd0061f381e09d0d4ba66331a614717a2ef88f0f887b1c1817794e873f6709089e1fd1fd58abf601dcffe9017fffd556, //k=17
        R = 425 - 1,
        RmodM = 'h6266ea86a2d27c9ffae7bca245ef0aec53439f2badb6a0480fa012749c8bd9dde2ecd0389ebc8f9c4291d51d3fbb2cd,
        R2modM = 'h11e41d6827510424302748620c16c6767000b6609ab32f0bbce50a9fa478fa904fb1a226d7c8c3b38ac02e272c3d3bb1,
        R_INV = 'h5d0b6a95f0f140c9be94886e201ff4e1d3e75366d5014ac26f9e99e3ac1a0d2797922988d180e68f014e4fe540686ac, //r=425
        N = 25, // R/K + 1, at K = 17
        M = 16, // R/L +1
        HALF_S = (M+D+1)/2, // must be even
//        S_1_3 = (M+D+1)/3,  // 1/3
        S_1_3 = 6,
        //S_1_4 = (M+D-1)/4;  // 1/4
        S_1_4 = 4,//(M+D)/4,
        ADD_DIV = 4,
        L3_CARRY = 8,
        LEN_12M_TILDE = 404,
        LEN_1024M_TILDE = 407,  // Must be divided by ADD_DIV
        N_THREADS = 6,
        BRAM_DEPTH = 10,
        LAT_PE = 3,
        LAT_QPMM = (N+1) * LAT_PE + 4;  

    typedef logic[$bits(M_tilde):0] uint_Mtilde2_t;
    typedef logic[M:0][47:0] qpmm_S_t;
    typedef logic[M-2:0][L-1:0] poly_Mpp_t;
    typedef logic[K*N-1:0] uint_fp_t;
    typedef logic[L*M-1:0] uint_fpa_t;
    typedef logic[M-1:0][L-1:0] poly_a_t; //288
    typedef logic[N-1:0][K-1:0] poly_b_t; //272
    typedef logic[(HALF_S-1)*L+48-1:0] qpmm_S_half;
    typedef logic[(S_1_3-1)*L+48-1:0] qpmm_S_1_3;
    typedef logic[(S_1_4-1)*L+48-1:0] qpmm_S_1_4;
    typedef logic[LEN_12M_TILDE/ADD_DIV-1:0] fp_div4_t; // uint divided by 4. Lack of 1 bit for 289
    // Struct
    typedef struct packed {
            logic carry;
            fp_div4_t val;
    } redundant_term_L1;
    typedef redundant_term_L1[ADD_DIV-1:0] redundant_poly_L1;
    typedef struct packed {
            logic [1:0] carry;
            fp_div4_t val;
    } redundant_term_L2;
    typedef redundant_term_L2[ADD_DIV-1:0] redundant_poly_L2;
    typedef struct packed {
            logic [L3_CARRY-1:0] carry;
            fp_div4_t val;
    } redundant_term_L3;
    typedef redundant_term_L3[ADD_DIV-1:0] redundant_poly_L3;
    // Union
    typedef union packed {
        logic[LEN_12M_TILDE-1:0] uint; // 272
        fp_div4_t[ADD_DIV-1:0] poly;
    } M_tilde12_t;
    typedef union packed {
        uint_fp_t uint;
        poly_b_t poly_b;
    } qpmm_fp_t;
    typedef union packed {
        uint_fpa_t uint;
        poly_a_t poly_a;
    } qpmm_fpa_t;
    typedef union packed {
        uint_fp_t uint;
        poly_b_t poly_b;
    } qpmm_fpb_t;
    function qpmm_fp_t rand_288();
        // N must be less than 320 bits
        rand_288 = {$urandom(), $urandom(), $urandom(), $urandom(), $urandom(), $urandom(), $urandom(), $urandom(), $urandom(), $urandom(), $urandom(), $urandom(), $urandom()};
    endfunction
endpackage

`else
//`define PARAMS_BN254_d0
package PARAMS_BN254_d0;
    localparam
        Mod = 256'h2523648240000001ba344d80000000086121000000000013a700000000000013, //The BN254 prime
        M_tilde = 267'h7d18c77dfc340005d1864d4d3800001c39ab785000000042327630000000003ffff; // 267 bits
    // localparam
    //     K = 16,
    //     L = 24,
    //     C = L - K,
    //     D = 0,
    //     _Mpp = 256'h7d18c77dfc340005d1864d4d3800001c39ab785000000042327630000000004, //k=16
    //     R = 272 - 1,
    //     R_INV = 'h131822b9f3de491ff4d85504410ed56c72e68c1f514017577c489d762ae9cf77, //r=272
    //     N = R/K + 1, // 17 at k=16
    //     M = R/L + 1, //12 at L=24
    //     HALF_S = (M+D+1)/2, // must be even
    //     S_1_3 = (M+D+1)/3,  // 1/3
    //     //S_1_4 = (M+D-1)/4;  // 1/4
    //     S_1_4 = (M+D)/4,
    //     ADD_DIV = 4,
    //     L3_CARRY = 8;  // 1/4
    localparam
        K = 17,
        L = 26,
        C = L - K,
        D = 0,
        _Mpp = 256'h3e8c63befe1a0002e8c326a69c00000e1cd5bc2800000021193b18000000002, //k=17
        R = 289 - 1,
        RmodM = 'h5b61645efa0be833e0cf20c7a8e86587e5efef111005428d8fffefa0f51466d,
        R2modM = 'h1e3ad4f19ece02905cd917dec0178837a70990ae5b87678a825bfd79f8a881b8,
        R_INV = 'h4183ffd639e59ef555105a592c220885737a65cc60fa8a23659c0a44ebb1577, //r=289
        N = R/K + 1, //17
        M = 11,
        HALF_S = (M+D+1)/2, // must be even
        S_1_3 = (M+D+1)/3,  // 1/3
        //S_1_4 = (M+D-1)/4;  // 1/4
        S_1_4 = 3,//(M+D)/4,
        ADD_DIV = 4,
        L3_CARRY = 8,
        LEN_12M_TILDE = 272,
        LEN_1024M_TILDE = 277, // Must be divided by ADD_DIV
        N_THREADS = 5,
        BRAM_DEPTH = 10,  // N_THREADS > 4, then 10
        LAT_PE = 4,
        LAT_QPMM = (N+1) * LAT_PE + 4;  
    
    typedef logic[$bits(M_tilde):0] uint_Mtilde2_t;
    typedef logic[M:0][47:0] qpmm_S_t;
    typedef logic[M-2:0][L-1:0] poly_Mpp_t;
    typedef logic[K*N-1:0] uint_fp_t;
    typedef logic[L*M-1:0] uint_fpa_t;
    typedef logic[M-1:0][L-1:0] poly_a_t; //288
    typedef logic[N-1:0][K-1:0] poly_b_t; //272
    typedef logic[(HALF_S-1)*L+48-1:0] qpmm_S_half;
    typedef logic[(S_1_3-1)*L+48-1:0] qpmm_S_1_3;
    typedef logic[(S_1_4-1)*L+48-1:0] qpmm_S_1_4;
    typedef logic[LEN_12M_TILDE/ADD_DIV-1:0] fp_div4_t; // uint divided by 4. Lack of 1 bit for 289

    // Struct
    typedef struct packed {
            logic carry;
            fp_div4_t val;
    } redundant_term_L1;
    typedef redundant_term_L1[ADD_DIV-1:0] redundant_poly_L1;

    typedef struct packed {
            logic [1:0] carry;
            fp_div4_t val;
    } redundant_term_L2;
    typedef redundant_term_L2[ADD_DIV-1:0] redundant_poly_L2;

    typedef struct packed {
            logic [L3_CARRY-1:0] carry;
            fp_div4_t val;
    } redundant_term_L3;
    typedef redundant_term_L3[ADD_DIV-1:0] redundant_poly_L3;


    // Union
    typedef union packed {
        logic[LEN_12M_TILDE-1:0] uint; // 272
        fp_div4_t[ADD_DIV-1:0] poly;
    } M_tilde12_t;

    typedef union packed {
        uint_fp_t uint;
        poly_b_t poly_b;
    } qpmm_fp_t;

    typedef union packed {
        uint_fpa_t uint;
        poly_a_t poly_a;
    } qpmm_fpa_t;

    typedef union packed {
        uint_fp_t uint;
        poly_b_t poly_b;
    } qpmm_fpb_t;

    function qpmm_fp_t rand_288();
        // N must be less than 320 bits
        rand_288 = {$urandom(), $urandom(), $urandom(), $urandom(), $urandom(), $urandom(), $urandom(), $urandom(), $urandom()};
    endfunction

endpackage
`endif 


`ifndef CONTROL
`define CONTROL
package CONTROL;
    import PARAMS_BN254_d0::BRAM_DEPTH;
    // Struct
    typedef struct packed {
        logic os;
        logic inve;
        logic [1:0] poa3;
        logic [1:0] poa2;
        logic [1:0] pos;
        logic [2:0] pom3;
        logic [2:0] pom2;
        logic [2:0] pom1;
        logic [2:0] cm; // Control instruction
        logic [1:0] pm2;
        logic [1:0] pm1;
        logic me1;
        logic me0;
        logic opc;
        logic [7:0] waddr0;
        logic [7:0] waddr1;
        logic [7:0] raddr0;
        logic [7:0] raddr1;
    } raw_rom_t;

    typedef struct packed {
        logic inve;
        logic [1:0] pos;
        logic [2:0] pom3;
        logic [2:0] pom2;
        logic [2:0] pom1;
        logic [2:0] cm;
        logic [1:0] pm;
        logic me1;
        logic me0;
    } ctrl_sig_t;

    typedef struct packed {
        logic inve;
        logic [1:0] pos;
        logic [2:0] pom3;
        logic [2:0] pom2;
        logic [2:0] pom1;
        logic [2:0] cm;
        logic [1:0] pm;
        logic offset_dst;
        logic offset_src0;
        logic offset_src1;
    } ctrl_sig_offset_t; // without cmul

    typedef struct packed {
        ctrl_sig_t csig;
        logic [BRAM_DEPTH-1:0] dst;
        logic [BRAM_DEPTH-1:0] src0;
        logic [BRAM_DEPTH-1:0] src1;
    } micro_ops_t;

    typedef struct packed {
        logic [1:0] field;
        logic t;
        logic [3:0] arith;
    } operation_t; // 7 bits

    typedef struct packed {
        logic me;
        logic [2:0] cm;
        logic [3:0] end_mop_cnt;
    } sub_operation_t; // 8 bits

    typedef struct packed {
        operation_t op;
        sub_operation_t sub_op;
    } opcode_t; // 15 bits

    typedef struct packed {
        opcode_t opcode;
        logic [6:0] dst;
        logic [6:0] src0;
        logic [6:0] src1;
    } instruction_t;

endpackage
`endif 


`ifndef PARAM_UART
`define PARAM_UART
package PARAM_UART;

    parameter   UART_CLK_FREQ = 100000000,
                UART_BAUD_RATE = 460800;
endpackage
`endif 
DSP_mul.sv
`timescale 1ns / 1ps
//////////////////////////////////////////////////////////////////////////////////
// Company: YNU
// Engineer: Junichi Sakamoto
// 
// Create Date: 09/06/2022 09:51:27 AM
// Design Name: 
// Module Name: DSP_mul
// Project Name: 
// Target Devices: 
// Tool Versions: 
// Description: 
// 
// Dependencies: 
// 
// Revision:
// Revision 0.01 - File Created
// Additional Comments:
// 
//////////////////////////////////////////////////////////////////////////////////
import PARAMS_BN254_d0::*;


module DSP_mul
    #(
    parameter latency = 0
    )(
    input clk,
    input [L-1:0] in_a,
    input [K-1:0] in_b,
    output [47:0] out_s
    );
		
		`ifdef XILINX_SIMULATOR 
            logic [26:0] reg_a, reg_m;
            logic [17:0] reg_b, reg_q;
            logic [47:0] reg_p, reg_pp;
            assign out_s = reg_pp;
            always @(posedge clk) begin
                reg_a <= 27'(in_a);
                reg_b <= 18'(in_b);
                reg_p <= reg_a * reg_b;
				reg_pp <= reg_p;
            end
        `else
			xbip_dsp48_macro_ab_3 mul_ab (
			.CLK(clk),      // input wire CLK
			.A(27'(in_a)),          // input wire [16 : 0] A
			.B(18'(in_b)),          // input wire [16 : 0] B
			.PCOUT(),  // output wire [47 : 0] PCOUT
			.P(out_s)          // output wire [33 : 0] P
			);
        `endif
endmodule



module PE
    #(
    parameter latency = 0
    )(
    input clk,
    input [K-1:0] in_b, in_q,
    input [L-1:0] in_a, in_m,
    input [2*K-1:0] in_sl,
    input [48-2*K-1:0] in_su,
    output [47:0] out_s
    );

    wire [47:0] w_pc;
        `ifdef XILINX_SIMULATOR 
            logic [26:0] reg_a, reg_m;
            logic [17:0] reg_b, reg_q;
            logic [47:0] reg_su, reg_sl, reg_sl_, reg_pcout, reg_prod, reg_p;
            assign out_s = reg_p;
            always @(posedge clk) begin
                reg_a <= 27'(in_a);
                reg_b <= 18'(in_b);
                reg_su <= 48'(in_su << K);
                reg_pcout <= reg_a * reg_b + reg_su;
                reg_m <= 27'(in_m);
                reg_q <= 18'(in_q);
                reg_sl_ <= 48'(in_sl << C);
                reg_sl <= reg_sl_;
                reg_prod <= reg_m * reg_q;
                reg_p <= reg_prod + reg_sl + reg_pcout;
            end
        `else
            xbip_dsp48_macro_ab_c_2 mul_ab (
            .CLK(clk),    // input wire CLK
            .A(27'(in_a)),        // input wire [26 : 0] A
            .B(18'(in_b)),        // input wire [17 : 0] B
            .C(48'(in_su << K)),    // input wire [47 : 0] C
            .PCOUT(w_pc),  // output wire [47 : 0] PCOUT
            .P()        // output wire [47 : 0] P
            );
            xbip_dsp48_macro_ab_c_pcin_3 mul_qm (
            .CLK(clk),    // input wire CLK
            .PCIN(w_pc),  // input wire [47 : 0] PCIN
            .A(27'(in_m)),        // input wire [26 : 0] A
            .B(18'(in_q)),        // input wire [17 : 0] B
            .C(48'(in_sl << C)),        // input wire [47 : 0] C
            .P(out_s)        // output wire [47 : 0] P
            );
        `endif
endmodule

QPMM_d0.sv
`timescale 1ns / 1ps
//////////////////////////////////////////////////////////////////////////////////
// Company: YNU
// Engineer: Junichi Sakamoto
// 
// Create Date: 09/5/2022 10:40:18 AM
// Design Name: 
// Module Name: QPMM_d0
// Project Name: 
// Target Devices: 
// Tool Versions: 
// Description: 
// 
// Dependencies: 
// 
// Revision:
// Revision 0.01 - File Created
// Additional Comments:
// 
//////////////////////////////////////////////////////////////////////////////////

import PARAMS_BN254_d0::*;
localparam latency_FA = 4;


module QPMM_d0(
    input clk, rstn,
    input [LEN_1024M_TILDE-1:0] A,
    input [LEN_1024M_TILDE-1:0] B,
    output uint_Mtilde2_t Z
    );

    ///////////////////////////////////////////////
    // Declaration
    //////////////////////////////////////////////
    qpmm_fpa_t[N+D-1:0] reg_A, buf_A, buf_A2, buf_A3;
    qpmm_fpb_t[N+D-1:0] reg_B, buf_B, buf_B2, buf_B3;


    qpmm_fpa_t Mpp = _Mpp;
    logic[N+D:0][K-1:0] wire_q;
    qpmm_S_t[N+D+1:0] reg_S;
    qpmm_S_t buf_S;

    ///////////////////////////////////////////////
    // Initilization
    //////////////////////////////////////////////
    // For 272>
    // assign reg_A[0][271:0] = A[271:0];
    // assign reg_A[0][285:272] = '0;
    assign reg_A[0] = ($bits(qpmm_fpa_t))'(A);
    assign reg_B[0] = ($bits(qpmm_fpb_t))'(B);
    assign reg_S[0] = {default: '0};
    assign wire_q[0] = '0;

    ///////////////////////////////////////////////
    // Final Addition
    //////////////////////////////////////////////
    if (latency_FA == 2) begin
        qpmm_S_half Z_L, Z_H;
        always @(posedge clk) begin
            buf_S <= reg_S[N+D+1];
            Z_L <= poly2int_half(buf_S[HALF_S-1:1]) + buf_S[0][47:K];
            Z_H <= poly2int_half(buf_S[M+D:HALF_S]);
            Z <=  (Z_H << (HALF_S-1)*L) + Z_L;
        end
    end
    else if (latency_FA == 3) begin
        qpmm_S_1_3 Z_L, Z_M, Z_H, Z_H2;
        logic [(S_1_3*2-1)*L+48-1:0] Z_ML;
        always @(posedge clk) begin
            buf_S <= reg_S[N+D+1];
            Z_L <= poly2int_1_3(buf_S[S_1_3-1:1]) + buf_S[0][47:K];
            Z_M <= poly2int_1_3(buf_S[(2*S_1_3)-1:S_1_3]);
            Z_H <= poly2int_half(buf_S[M+D:(2*S_1_3)]);
            Z_ML <= (Z_M << (S_1_3-1)*L) + Z_L;
            Z_H2 <= Z_H;
            Z <= (Z_H2 << (2*S_1_3-1)*L) + Z_ML;
        end
    end
    else if (latency_FA == 4) begin
        qpmm_S_1_4 Z_LL, Z_LH, Z_HL, Z_HH;
        qpmm_S_half Z_L, Z_H;
        always @(posedge clk) begin
            buf_S <= reg_S[N+D+1];
            Z_LL <= poly2int_1_4(buf_S[S_1_4:1]) + buf_S[0][47:K];
            Z_LH <= poly2int_1_4(buf_S[2*S_1_4:S_1_4+1]);
            Z_HL <= poly2int_1_4(buf_S[3*S_1_4:2*S_1_4+1]);
            `ifdef BLS12_381
                Z_HH <= poly2int_1_4(buf_S[4*S_1_4:3*S_1_4+1]);
            `else
                if(K==16)
                    Z_HH <= poly2int_1_4(buf_S[4*S_1_4:3*S_1_4+1]);
                else if(K==17)
                    Z_HH <= poly2int_1_4({40'd0, buf_S[3*S_1_4+2:3*S_1_4+1]});
            `endif
            Z_L <= (Z_LH << (S_1_4*L)) + Z_LL;
            Z_H <= (Z_HH << (S_1_4*L)) + Z_HL;
            Z <= (Z_H << (2*S_1_4*L)) + Z_L;
        end
    end
    ///////////////////////////////////////////////
    // Body
    //////////////////////////////////////////////
    for(genvar i = 0; i < N + D + 1; i = i + 1) begin : QPMM
        for(genvar j = 0; j < M + D; j = j + 1) begin : for_1
            if(i == 0)
                DSP_mul #(.latency(LAT_PE)) mul_ab (.clk(clk), .in_a(reg_A[i].poly_a[j]), .in_b(reg_B[i].poly_b[i]), .out_s(reg_S[i+1][j+1]));
            else if(i == N + D) begin
                if(j == M - 1)
                    assign reg_S[i+1][j+1] = '0;
                else
                    PE #(.latency(LAT_PE)) pe(.clk(clk), .in_a('0), .in_b('0), .in_m(Mpp.poly_a[j]), .in_q(wire_q[i]), .in_sl(reg_S[i][j+2][2*K-1:0]), .in_su(reg_S[i][j+1][47:2*K]), .out_s(reg_S[i+1][j+1])); 
            end
            else begin
                if(j == M - 1)
                    DSP_mul #(.latency(LAT_PE)) mul_ab (.clk(clk), .in_a(reg_A[i].poly_a[j]), .in_b(reg_B[i].poly_b[i]), .out_s(reg_S[i+1][j+1]));
                else 
                    PE #(.latency(LAT_PE)) pe(.clk(clk), .in_a(reg_A[i].poly_a[j]), .in_b(reg_B[i].poly_b[i]), .in_m(Mpp.poly_a[j]), .in_q(wire_q[i]), .in_sl(reg_S[i][j+2][2*K-1:0]), .in_su(reg_S[i][j+1][47:2*K]), .out_s(reg_S[i+1][j+1])); 
            end
        end
        wire [47:0] buf_q = reg_S[i][0][47:K] + reg_S[i][1][2*K-1:0];
        assign wire_q[i] = buf_q[K-1:0];

        if (LAT_PE == 1) begin
            always_ff @(posedge clk) begin : ff_ab_q
                if(i < N + D) begin
                    reg_A[i+1] <= reg_A[i];
                    reg_B[i+1] <= reg_B[i];
                end
                reg_S[i+1][0] <= buf_q;
            end
        end
        else if (LAT_PE == 2) begin
            logic [47:0] buf_s0;
            always_ff @(posedge clk) begin : ff_ab_q
                if(i < N + D) begin
                    buf_A[i] <= reg_A[i];
                    buf_B[i] <= reg_B[i];
                    reg_A[i+1] <= buf_A[i];
                    reg_B[i+1] <= buf_B[i];
                end
                buf_s0 <= buf_q;
                reg_S[i+1][0] <= buf_s0;
            end
        end
        else if (LAT_PE == 3) begin
            logic [47:0] buf_s0_0, buf_s0_1;
            always_ff @(posedge clk) begin : ff_ab_q
                if(i < N + D) begin
                    buf_A[i] <= reg_A[i];
                    buf_B[i] <= reg_B[i];
                    buf_A2[i] <= buf_A[i];
                    buf_B2[i] <= buf_B[i];
                    reg_A[i+1] <= buf_A2[i];
                    reg_B[i+1] <= buf_B2[i];
                end
                buf_s0_0 <= buf_q;
                buf_s0_1 <= buf_s0_0;
                reg_S[i+1][0] <= buf_s0_1;
            end
        end
        else if (LAT_PE == 4) begin
            logic [47:0] buf_s0_0, buf_s0_1, buf_s0_2;
            always_ff @(posedge clk) begin : ff_ab_q
                if(i < N + D) begin
                    buf_A[i] <= reg_A[i];
                    buf_B[i] <= reg_B[i];
                    buf_A2[i] <= buf_A[i];
                    buf_B2[i] <= buf_B[i];
                    buf_A3[i] <= buf_A2[i];
                    buf_B3[i] <= buf_B2[i];
                    reg_A[i+1] <= buf_A3[i];
                    reg_B[i+1] <= buf_B3[i];
                end
                buf_s0_0 <= buf_q;
                buf_s0_1 <= buf_s0_0;
                buf_s0_2 <= buf_s0_1;
                reg_S[i+1][0] <= buf_s0_2;
            end
        end
    end

    function qpmm_S_half poly2int_1_4;
        input [S_1_4-1:0][47:0] A;

            poly2int_1_4 = 0;
            for(integer i = 0; i < S_1_4; i = i + 1) begin
                poly2int_1_4 = poly2int_1_4 + (A[i] << (L*i));
            end
    endfunction

    function qpmm_S_half poly2int_1_3;
        input [S_1_3-1:0][47:0] A;

            poly2int_1_3 = 0;
            for(integer i = 0; i < S_1_3; i = i + 1) begin
                poly2int_1_3 = poly2int_1_3 + (A[i] << (L*i));
            end
    endfunction

    function qpmm_S_half poly2int_half;
        input [HALF_S-1:0][47:0] A;

            poly2int_half = 0;
            for(integer i = 0; i < HALF_S; i = i + 1) begin
                poly2int_half = poly2int_half + (A[i] << (L*i));
            end
    endfunction

    function qpmm_fpa_t poly2int;
        input qpmm_S_t A;

            poly2int = 0;
            for(integer i = 0; i <= M+D+1; i = i + 1) begin
                poly2int = poly2int + (A[i] << (L*i));
            end
    endfunction

endmodule
test_QPMM.sv
`timescale 1ns / 1ps
//////////////////////////////////////////////////////////////////////////////////
// Company: YNU
// Engineer: Junichi Sakamoto
// 
// Create Date: 08/16/2022 11:50:10 AM
// Design Name: 
// Module Name: test_QPMM
// Project Name: 
// Target Devices: 
// Tool Versions: 
// Description: 
// 
// Dependencies: 
// 
// Revision:
// Revision 0.01 - File Created
// Additional Comments:
// 
//////////////////////////////////////////////////////////////////////////////////

import PARAMS_BN254_d0::*;

`ifdef BLS12_381
    localparam bit_width = 381;
    localparam N_PIPELINE_STAGES = 82;
    //localparam N_PIPELINE_STAGES = 108;

`else
    localparam bit_width = 254 - 3; // Why -3?
    //localparam N_PIPELINE_STAGES = 22;
    //localparam N_PIPELINE_STAGES = 40;
    //localparam N_PIPELINE_STAGES = 58;
    localparam N_PIPELINE_STAGES = 76;
`endif 

module test_QPMM;
    localparam 
        CYCLE = 10,
        DELAY = 2,
        N_DATA = 1000000;
               
    reg clk, rstn;
    qpmm_fp_t A, B;
    wire [bit_width+16:0] Z;
    wire [999:0] tmp_ans = MR(A) * MR(B);
    wire [bit_width-1:0] ans = tmp_ans % PARAMS_BN254_d0::Mod;
    wire [bit_width-1:0] res = MR(Z);
    reg [N_PIPELINE_STAGES:0][bit_width-1:0] reg_ans;

    QPMM_d0 DUT(.clk, .rstn, .A, .B, .Z);
    
    always begin
        #(CYCLE/2) clk <= ~clk;
    end
    
    for(genvar i = 0; i < N_PIPELINE_STAGES; i++) begin
        always @(posedge clk) begin
            reg_ans[i+1] <= reg_ans[i];
        end
    end

    /*-------------------------------------------
    Test
    -------------------------------------------*/
    initial begin
        rstn <= 1'b1;
        clk <= 1'b1;
    #(CYCLE*10)
        rstn <= 1'b0;
    #(CYCLE*10)
        rstn <= 1'b1;
    #DELAY;
        $display("Test QPMM start\n");
        for(integer i = 0; i < N_PIPELINE_STAGES; i = i + 1) begin
            A <= rand_288() % (1024*PARAMS_BN254_d0::M_tilde);
            B <= rand_288() % (1024*PARAMS_BN254_d0::M_tilde);
//            A <= 272'h9d289143693cedb99a3634824056732c6ab659f650c97a3a07de4a3de195611861a;
//            B <= 272'hcfc83b144a374cedb4ebb176cd5f622edee5ea20fbe8dcd1b6cec23632499a3cc0a1;
            #DELAY
            reg_ans[0] <= ans;
            #(CYCLE-DELAY);
        end

        for(integer i = N_PIPELINE_STAGES; i < N_DATA; i = i + 1) begin
            A <= rand_288() % (1024*PARAMS_BN254_d0::M_tilde);
            B <= rand_288() % (1024*PARAMS_BN254_d0::M_tilde);
            
            #DELAY
            reg_ans[0] <= ans;
            //$display("%d: x = %h, y = %h", i, A, B);
            if(res !== reg_ans[N_PIPELINE_STAGES]) begin
                $display("#%d Failed: ans = %h, res = %h", i, reg_ans[N_PIPELINE_STAGES], res);
                //$display("x = %h, y = %h", x, y);
                $stop();
            end
            #(CYCLE-DELAY);
        end
        $display("#%d Test end\n", N_DATA);
        $finish;
    end


    function [bit_width-1:0] MR;
        input qpmm_fp_t A;
        logic [999:0] tmp_ans = (PARAMS_BN254_d0::R_INV*A);
        MR = tmp_ans % PARAMS_BN254_d0::Mod;
    endfunction

endmodule

検証コードは前作った大規模な剰余乗算器で、ペアリング用の素体乗算でかなり周波数出るようにチューニングしてある。
import, packed union, packed structなどが含まれていたり、プリプロディレクティブを使ったりしているが、この辺は大丈夫なのか気になる。
あとxilinx IPを使ってる部分があるので、そこはHDL記述に直す。

このあたり
https://lsifrontend.hatenablog.com/entry/2022/11/14/150102
https://marsee101.blog.fc2.com/blog-entry-1812.html
を参考にビルドしてみる。
指定するファイルはトップだけでいいらしいので、まず

verilator --binary src/test_QPMM.sv

を実行

すると次のようなエラー

%Error-PKGNODECL: src/test_QPMM.sv:22:8: Package/class 'PARAMS_BN254_d0' not found, and needs to be predeclared (IEEE 1800-2017 26.3)
   22 | import PARAMS_BN254_d0::*;
      |        ^~~~~~~~~~~~~~~
                  ... For error description see https://verilator.org/warn/PKGNODECL?v=5.011
%Error: src/test_QPMM.sv:22:8: Importing from missing package 'PARAMS_BN254_d0'
   22 | import PARAMS_BN254_d0::*;
      |        ^~~~~~~~~~~~~~~

インポートするファイルが見つからない的なことを言われているので、インポートファイルを明示してみる。

verilator --binary src/params.sv src/test_QPMM.sv 

するとうまくいったのか、エラーが次のように変わる。

%Error: src/params.sv:37:17: Too many digits for 32 bit number: 'h6266ea86a2d27c9ffae7bca245ef0aec53439f2badb6a0480fa012749c8bd9dde2ecd0389ebc8f9c4291d51d3fbb2cd
   37 |         RmodM = 'h6266ea86a2d27c9ffae7bca245ef0aec53439f2badb6a0480fa012749c8bd9dde2ecd0389ebc8f9c4291d51d3fbb2cd,
      |                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

パラメータのビット幅省略がダメっぽいので、指定してやる。

すると

%Error: src/test_QPMM.sv:51:5: Cannot find file containing module: 'QPMM_d0'
   51 |     QPMM_d0 DUT(.clk, .rstn, .A, .B, .Z);
      |     ^~~~~~~

モジュールが見つからんといわれる。カレントディレクトリしか探してくれないんだろうな。ということでsrcフォルダに入ってやってみる。

%Error: QPMM_d0.sv:53:24: syntax error, unexpected default, expecting TYPE-IDENTIFIER
   53 |     assign reg_S[0] = {default: '0};
      |                        ^~~~~~~
        test_QPMM.sv:53:1: ... note: In file included from test_QPMM.sv
%Error: Exiting due to 1 error(s)

全ての構造体に同じ値を設定するdefaultは使えない?
とりあえず全部0にすればいいので`0にしてみる。

すると次は

%Error: DSP_mul.sv:144:13: Cannot find file containing module: 'xbip_dsp48_macro_ab_c_2'
  144 |             xbip_dsp48_macro_ab_c_2 mul_ab (
      |             ^~~~~~~~~~~~~~~~~~~~~~~

こんな感じでIPはやっぱりだめみたい。ifdefで消されると期待したけどそうではなかった。
というわけでip記述全部消す。

%Error: Internal Error: test_QPMM.sv:106:57: ../V3Scope.cpp:80: Can't locate varref scope
  106 |         logic [999:0] tmp_ans = (PARAMS_BN254_d0::R_INV*A);

これはたぶんlogicには宣言と代入が同時にできないというやつだろう。functionの中ではなぜか許されてたので使っていたが

test_QPMM.sv
logic [999:0] tmp_ans;
assign tmp_ans = (PARAMS_BN254_d0::R_INV*A);

このように直した。

まだまだありそう

%Error-BLKANDNBLK: QPMM_d0.sv:37:25: Unsupported: Blocked and non-blocking assignments to same variable: 'test_QPMM.DUT.reg_B'
   37 |     qpmm_fpb_t[N+D-1:0] reg_B, buf_B, buf_B2, buf_B3;
      |                         ^~~~~
                   test_QPMM.sv:53:1: ... note: In file included from test_QPMM.sv
                   QPMM_d0.sv:29:33: ... Location of blocking assignment
   29 |     input [LEN_1024M_TILDE-1:0] B,
      |                                 ^
                   QPMM_d0.sv:156:32: ... Location of nonblocking assignment
  156 |                     reg_B[i+1] <= buf_B2[i];
      |                                ^~

ブロッキングとノンブロッキング代入を同じ変数に使えないらしい。エラーでは29行目でブロッキングしていることになってるが実際には52行目でassign reg_B[0] = ($bits(qpmm_fpb_t))'(B);をやってるのでこれのことだろう。
これはassignなのでブロッキング代入してるわけではなくて、reg_B[0]をwireとして使ってるということで、for文で0番目の要素を特別扱いしなくてよくなるので使ってたんだけど、コーディングスタイルが悪いかな・・・
always_combの中で<=演算子を使えばいいかと思ったけど、<=がブロッキング代入として認識されてうまくいかなかった。
しかたないのでひとつずつ見て行ってノンブロッキング/ブロッキングの変換を行った。これは結構手間だし、変換前後で合ってるかどうかわからないので、かなりストレス。

ここまで来てエラーは消えたが、53個ワーニングが残っているせいか実行形式が作成されない。
ひとまず-Wno-lint -Wno-fatalで無視しよう。すると

g++: error: unrecognized command line option ‘-fcoroutines’; did you mean ‘-fc-prototypes’?

エラーが出る。g++7/8/9で試したがだめだった。最終的にg++-11でコンパイルできることを確認

気になる実行結果は

実行結果
Test QPMM start

*** stack smashing detected ***: <unknown> terminated
Aborted

まあそんなもん。$displayの文字列が表示されてるので動いてはいるので前進かな・・・
とりあえず、今までのsvが簡単に動くという感じではないということはわかった。

Discussion