In [1]:
%load_ext autoreload
%autoreload 2

from tvm.relay.testing.resnet import get_workload as get_resnet
from compiler import Compiler
from tempfile import NamedTemporaryFile
from subprocess import run, CalledProcessError
import numpy as np
from print_array_efficiently import print_array_efficiently
import tvm
import pickle
from os import linesep
In [2]:
resnet50_module, resnet50_params = get_resnet(
    layout="NHWC", image_shape=(224, 224, 3), num_layers=50
)

compiled = Compiler(resnet50_module)

c_filename = "resnet50.c"
with open(c_filename, "w") as f:
    f.write(compiled.get_file())

run(["clang-format", "-i", c_filename], check=True)

with open(c_filename) as f:
    print(f.read())
extern void rtml_systolic_array_weight_stationary_fc(
    int hardware_id, float *out, float *activations, float *weights,
    int input_vector_size, int output_vector_size, int batch);
extern void rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
    int hardware_id, float *out, float *activations, float *weights, int h,
    int w, int kernel_h, int kernel_w, int in_channels, int out_channels,
    int stride_h, int stride_w);
extern void batchNormInference(float *X, float *Y, int N, int H, int W, int C,
                               float *gamma, float *beta, float *mu, float *var,
                               float epsilon);
extern void softmax1D(float *X, float *Y, int N);
extern void relu(float *X, float *Y, int N, int H, int W, int C);
extern void globalAvgPool(float *X, float *Y, int N, int H, int W, int C);
extern void add_with_broadcasting(float *out, float *a, float *b,
                                  int *out_shape, int out_ndims, int *a_shape,
                                  int a_ndims, int *b_shape, int b_ndims);
extern void add(float *out, float *a, float *b);
extern void maxpool2D3x3_resnet18_op6(float *X, float *Y);
extern void zero_pad_nhwc(float *out, float *in, int h, int w, int c,
                          int pad_north, int pad_east, int pad_south,
                          int pad_west);
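/*
 * Annotation: the externs above are the runtime interface the compiler
 * emits against. The two rtml_systolic_array_* entry points offload
 * fully-connected and convolution work to the weight-stationary
 * systolic array (hardware_id selects an array; the monolithic design
 * uses 0 throughout), while the remaining kernels (batch norm, relu,
 * pooling, padding, adds, softmax) are plain C. As a rough sketch of
 * the intended semantics (not the project's actual runtime source),
 * relu over an NHWC tensor is just an elementwise max with zero:
 *
 *   void relu(float *X, float *Y, int N, int H, int W, int C) {
 *     for (int i = 0; i < N * H * W * C; i++)
 *       Y[i] = X[i] > 0.0f ? X[i] : 0.0f;
 *   }
 */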
float b94157387959776[1][224][224][3];
float b94157388635216prepadding[1][112][112][64];
float b94157388635216[1][112][112][64];
float b94157387960288[1][112][112][64];
float b94157387960480[1][112][112][64];
float b94157387960768[1][56][56][64];
float b94157387961328[1][56][56][64];
float b94157387961520[1][56][56][64];
float b94157387961808prepadding[1][56][56][64];
float b94157387961808[1][56][56][64];
float b94157387962368[1][56][56][64];
float b94157387962560[1][56][56][64];
float b94157387962848prepadding[1][56][56][64];
float b94157387962848[1][56][56][64];
float b94157387963408[1][56][56][64];
float b94157387963600[1][56][56][64];
float b94157387963888prepadding[1][56][56][256];
float b94157387963888[1][56][56][256];
float b94157387964176prepadding[1][56][56][256];
float b94157387964176[1][56][56][256];
float b94157387964464[1][56][56][256];
float b94157387965024[1][56][56][256];
float b94157387965216[1][56][56][256];
float b94157387965504prepadding[1][56][56][64];
float b94157387965504[1][56][56][64];
float b94157387966064[1][56][56][64];
float b94157387966256[1][56][56][64];
float b94157387966544prepadding[1][56][56][64];
float b94157387966544[1][56][56][64];
float b94157387967104[1][56][56][64];
float b94157387967296[1][56][56][64];
float b94157387967584prepadding[1][56][56][256];
float b94157387967584[1][56][56][256];
float b94157387967872[1][56][56][256];
float b94157387968432[1][56][56][256];
float b94157387968624[1][56][56][256];
float b94157387968912prepadding[1][56][56][64];
float b94157387968912[1][56][56][64];
float b94157387969472[1][56][56][64];
float b94157387969664[1][56][56][64];
float b94157387969952prepadding[1][56][56][64];
float b94157387969952[1][56][56][64];
float b94157387970512[1][56][56][64];
float b94157387970704[1][56][56][64];
float b94157387970992prepadding[1][56][56][256];
float b94157387970992[1][56][56][256];
float b94157387971280[1][56][56][256];
float b94157387971840[1][56][56][256];
float b94157387972032[1][56][56][256];
float b94157387972320prepadding[1][28][28][128];
float b94157387972320[1][28][28][128];
float b94157387972880[1][28][28][128];
float b94157387973072[1][28][28][128];
float b94157387973360prepadding[1][28][28][128];
float b94157387973360[1][28][28][128];
float b94157387973920[1][28][28][128];
float b94157387974112[1][28][28][128];
float b94157387974400prepadding[1][28][28][512];
float b94157387974400[1][28][28][512];
float b94157387974688prepadding[1][28][28][512];
float b94157387974688[1][28][28][512];
float b94157387974976[1][28][28][512];
float b94157388802288[1][28][28][512];
float b94157388802480[1][28][28][512];
float b94157388802768prepadding[1][28][28][128];
float b94157388802768[1][28][28][128];
float b94157388803328[1][28][28][128];
float b94157387941008[1][28][28][128];
float b94157387941296prepadding[1][28][28][128];
float b94157387941296[1][28][28][128];
float b94157387941856[1][28][28][128];
float b94157387942048[1][28][28][128];
float b94157387942336prepadding[1][28][28][512];
float b94157387942336[1][28][28][512];
float b94157387942624[1][28][28][512];
float b94157387943184[1][28][28][512];
float b94157387943376[1][28][28][512];
float b94157388809200prepadding[1][28][28][128];
float b94157388809200[1][28][28][128];
float b94157388809712[1][28][28][128];
float b94157388809904[1][28][28][128];
float b94157388810192prepadding[1][28][28][128];
float b94157388810192[1][28][28][128];
float b94157388810752[1][28][28][128];
float b94157388810944[1][28][28][128];
float b94157388811232prepadding[1][28][28][512];
float b94157388811232[1][28][28][512];
float b94157388811520[1][28][28][512];
float b94157388812080[1][28][28][512];
float b94157388812272[1][28][28][512];
float b94157388812560prepadding[1][28][28][128];
float b94157388812560[1][28][28][128];
float b94157388813120[1][28][28][128];
float b94157388813312[1][28][28][128];
float b94157388813600prepadding[1][28][28][128];
float b94157388813600[1][28][28][128];
float b94157388814160[1][28][28][128];
float b94157388814352[1][28][28][128];
float b94157388814640prepadding[1][28][28][512];
float b94157388814640[1][28][28][512];
float b94157388814928[1][28][28][512];
float b94157388815488[1][28][28][512];
float b94157388815680[1][28][28][512];
float b94157388815968prepadding[1][14][14][256];
float b94157388815968[1][14][14][256];
float b94157388816528[1][14][14][256];
float b94157388816720[1][14][14][256];
float b94157388817008prepadding[1][14][14][256];
float b94157388817008[1][14][14][256];
float b94157388817568[1][14][14][256];
float b94157388817760[1][14][14][256];
float b94157388818048prepadding[1][14][14][1024];
float b94157388818048[1][14][14][1024];
float b94157388818336prepadding[1][14][14][1024];
float b94157388818336[1][14][14][1024];
float b94157388818624[1][14][14][1024];
float b94157388819184[1][14][14][1024];
float b94157388819376[1][14][14][1024];
float b94157388819664prepadding[1][14][14][256];
float b94157388819664[1][14][14][256];
float b94157388820224[1][14][14][256];
float b94157388820416[1][14][14][256];
float b94157388820704prepadding[1][14][14][256];
float b94157388820704[1][14][14][256];
float b94157388821264[1][14][14][256];
float b94157388821456[1][14][14][256];
float b94157388821744prepadding[1][14][14][1024];
float b94157388821744[1][14][14][1024];
float b94157388822032[1][14][14][1024];
float b94157388822592[1][14][14][1024];
float b94157388822784[1][14][14][1024];
float b94157388823072prepadding[1][14][14][256];
float b94157388823072[1][14][14][256];
float b94157388823632[1][14][14][256];
float b94157388823824[1][14][14][256];
float b94157388824112prepadding[1][14][14][256];
float b94157388824112[1][14][14][256];
float b94157388824672[1][14][14][256];
float b94157388824864[1][14][14][256];
float b94157388825152prepadding[1][14][14][1024];
float b94157388825152[1][14][14][1024];
float b94157388825440[1][14][14][1024];
float b94157388826000[1][14][14][1024];
float b94157388826192[1][14][14][1024];
float b94157388826480prepadding[1][14][14][256];
float b94157388826480[1][14][14][256];
float b94157388827040[1][14][14][256];
float b94157388827232[1][14][14][256];
float b94157388827520prepadding[1][14][14][256];
float b94157388827520[1][14][14][256];
float b94157388828080[1][14][14][256];
float b94157388828272[1][14][14][256];
float b94157388828560prepadding[1][14][14][1024];
float b94157388828560[1][14][14][1024];
float b94157388828848[1][14][14][1024];
float b94157388829408[1][14][14][1024];
float b94157388829600[1][14][14][1024];
float b94157388829888prepadding[1][14][14][256];
float b94157388829888[1][14][14][256];
float b94157388830448[1][14][14][256];
float b94157388830640[1][14][14][256];
float b94157388830928prepadding[1][14][14][256];
float b94157388830928[1][14][14][256];
float b94157388831488[1][14][14][256];
float b94157388831680[1][14][14][256];
float b94157388831968prepadding[1][14][14][1024];
float b94157388831968[1][14][14][1024];
float b94157388832256[1][14][14][1024];
float b94157388832816[1][14][14][1024];
float b94157388833008[1][14][14][1024];
float b94157388833296prepadding[1][14][14][256];
float b94157388833296[1][14][14][256];
float b94157388833856[1][14][14][256];
float b94157388834048[1][14][14][256];
float b94157388834336prepadding[1][14][14][256];
float b94157388834336[1][14][14][256];
float b94157388834896[1][14][14][256];
float b94157388835088[1][14][14][256];
float b94157388835376prepadding[1][14][14][1024];
float b94157388835376[1][14][14][1024];
float b94157388835664[1][14][14][1024];
float b94157388836224[1][14][14][1024];
float b94157388836416[1][14][14][1024];
float b94157388836704prepadding[1][7][7][512];
float b94157388836704[1][7][7][512];
float b94157388837264[1][7][7][512];
float b94157388837456[1][7][7][512];
float b94157388837744prepadding[1][7][7][512];
float b94157388837744[1][7][7][512];
float b94157388838304[1][7][7][512];
float b94157388838496[1][7][7][512];
float b94157388838784prepadding[1][7][7][2048];
float b94157388838784[1][7][7][2048];
float b94157388839072prepadding[1][7][7][2048];
float b94157388839072[1][7][7][2048];
float b94157388839360[1][7][7][2048];
float b94157388839920[1][7][7][2048];
float b94157388840112[1][7][7][2048];
float b94157388840400prepadding[1][7][7][512];
float b94157388840400[1][7][7][512];
float b94157388840960[1][7][7][512];
float b94157388841152[1][7][7][512];
float b94157388841440prepadding[1][7][7][512];
float b94157388841440[1][7][7][512];
float b94157388842000[1][7][7][512];
float b94157388842192[1][7][7][512];
float b94157388842480prepadding[1][7][7][2048];
float b94157388842480[1][7][7][2048];
float b94157388842768[1][7][7][2048];
float b94157388843328[1][7][7][2048];
float b94157388843520[1][7][7][2048];
float b94157388843808prepadding[1][7][7][512];
float b94157388843808[1][7][7][512];
float b94157388844368[1][7][7][512];
float b94157388844560[1][7][7][512];
float b94157388844848prepadding[1][7][7][512];
float b94157388844848[1][7][7][512];
float b94157388845408[1][7][7][512];
float b94157388845600[1][7][7][512];
float b94157388845888prepadding[1][7][7][2048];
float b94157388845888[1][7][7][2048];
float b94157388846176[1][7][7][2048];
float b94157388846736[1][7][7][2048];
float b94157388846928[1][7][7][2048];
float b94157388847216[1][1][1][2048];
float b94157388847792[1][1000];
float b94157388848080[1][1000];
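/*
 * Annotation: the file-scope floats above are the statically allocated
 * intermediate activation buffers. Their names appear to be generated
 * from the memory addresses of the corresponding Relay expressions, so
 * the exact digits vary between compiler runs; buffers with a
 * `prepadding` suffix hold the explicitly zero-padded copies consumed
 * by the prepadded conv2d kernel.
 */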

void compiled(
    float *out, float *data, float *bn_data_gamma, float *bn_data_beta,
    float *bn_data_moving_mean, float *bn_data_moving_var, float *conv0_weight,
    float *bn0_gamma, float *bn0_beta, float *bn0_moving_mean,
    float *bn0_moving_var, float *stage1_unit1_bn1_gamma,
    float *stage1_unit1_bn1_beta, float *stage1_unit1_bn1_moving_mean,
    float *stage1_unit1_bn1_moving_var, float *stage1_unit1_conv1_weight,
    float *stage1_unit1_bn2_gamma, float *stage1_unit1_bn2_beta,
    float *stage1_unit1_bn2_moving_mean, float *stage1_unit1_bn2_moving_var,
    float *stage1_unit1_conv2_weight, float *stage1_unit1_bn3_gamma,
    float *stage1_unit1_bn3_beta, float *stage1_unit1_bn3_moving_mean,
    float *stage1_unit1_bn3_moving_var, float *stage1_unit1_conv3_weight,
    float *stage1_unit1_sc_weight, float *stage1_unit2_bn1_gamma,
    float *stage1_unit2_bn1_beta, float *stage1_unit2_bn1_moving_mean,
    float *stage1_unit2_bn1_moving_var, float *stage1_unit2_conv1_weight,
    float *stage1_unit2_bn2_gamma, float *stage1_unit2_bn2_beta,
    float *stage1_unit2_bn2_moving_mean, float *stage1_unit2_bn2_moving_var,
    float *stage1_unit2_conv2_weight, float *stage1_unit2_bn3_gamma,
    float *stage1_unit2_bn3_beta, float *stage1_unit2_bn3_moving_mean,
    float *stage1_unit2_bn3_moving_var, float *stage1_unit2_conv3_weight,
    float *stage1_unit3_bn1_gamma, float *stage1_unit3_bn1_beta,
    float *stage1_unit3_bn1_moving_mean, float *stage1_unit3_bn1_moving_var,
    float *stage1_unit3_conv1_weight, float *stage1_unit3_bn2_gamma,
    float *stage1_unit3_bn2_beta, float *stage1_unit3_bn2_moving_mean,
    float *stage1_unit3_bn2_moving_var, float *stage1_unit3_conv2_weight,
    float *stage1_unit3_bn3_gamma, float *stage1_unit3_bn3_beta,
    float *stage1_unit3_bn3_moving_mean, float *stage1_unit3_bn3_moving_var,
    float *stage1_unit3_conv3_weight, float *stage2_unit1_bn1_gamma,
    float *stage2_unit1_bn1_beta, float *stage2_unit1_bn1_moving_mean,
    float *stage2_unit1_bn1_moving_var, float *stage2_unit1_conv1_weight,
    float *stage2_unit1_bn2_gamma, float *stage2_unit1_bn2_beta,
    float *stage2_unit1_bn2_moving_mean, float *stage2_unit1_bn2_moving_var,
    float *stage2_unit1_conv2_weight, float *stage2_unit1_bn3_gamma,
    float *stage2_unit1_bn3_beta, float *stage2_unit1_bn3_moving_mean,
    float *stage2_unit1_bn3_moving_var, float *stage2_unit1_conv3_weight,
    float *stage2_unit1_sc_weight, float *stage2_unit2_bn1_gamma,
    float *stage2_unit2_bn1_beta, float *stage2_unit2_bn1_moving_mean,
    float *stage2_unit2_bn1_moving_var, float *stage2_unit2_conv1_weight,
    float *stage2_unit2_bn2_gamma, float *stage2_unit2_bn2_beta,
    float *stage2_unit2_bn2_moving_mean, float *stage2_unit2_bn2_moving_var,
    float *stage2_unit2_conv2_weight, float *stage2_unit2_bn3_gamma,
    float *stage2_unit2_bn3_beta, float *stage2_unit2_bn3_moving_mean,
    float *stage2_unit2_bn3_moving_var, float *stage2_unit2_conv3_weight,
    float *stage2_unit3_bn1_gamma, float *stage2_unit3_bn1_beta,
    float *stage2_unit3_bn1_moving_mean, float *stage2_unit3_bn1_moving_var,
    float *stage2_unit3_conv1_weight, float *stage2_unit3_bn2_gamma,
    float *stage2_unit3_bn2_beta, float *stage2_unit3_bn2_moving_mean,
    float *stage2_unit3_bn2_moving_var, float *stage2_unit3_conv2_weight,
    float *stage2_unit3_bn3_gamma, float *stage2_unit3_bn3_beta,
    float *stage2_unit3_bn3_moving_mean, float *stage2_unit3_bn3_moving_var,
    float *stage2_unit3_conv3_weight, float *stage2_unit4_bn1_gamma,
    float *stage2_unit4_bn1_beta, float *stage2_unit4_bn1_moving_mean,
    float *stage2_unit4_bn1_moving_var, float *stage2_unit4_conv1_weight,
    float *stage2_unit4_bn2_gamma, float *stage2_unit4_bn2_beta,
    float *stage2_unit4_bn2_moving_mean, float *stage2_unit4_bn2_moving_var,
    float *stage2_unit4_conv2_weight, float *stage2_unit4_bn3_gamma,
    float *stage2_unit4_bn3_beta, float *stage2_unit4_bn3_moving_mean,
    float *stage2_unit4_bn3_moving_var, float *stage2_unit4_conv3_weight,
    float *stage3_unit1_bn1_gamma, float *stage3_unit1_bn1_beta,
    float *stage3_unit1_bn1_moving_mean, float *stage3_unit1_bn1_moving_var,
    float *stage3_unit1_conv1_weight, float *stage3_unit1_bn2_gamma,
    float *stage3_unit1_bn2_beta, float *stage3_unit1_bn2_moving_mean,
    float *stage3_unit1_bn2_moving_var, float *stage3_unit1_conv2_weight,
    float *stage3_unit1_bn3_gamma, float *stage3_unit1_bn3_beta,
    float *stage3_unit1_bn3_moving_mean, float *stage3_unit1_bn3_moving_var,
    float *stage3_unit1_conv3_weight, float *stage3_unit1_sc_weight,
    float *stage3_unit2_bn1_gamma, float *stage3_unit2_bn1_beta,
    float *stage3_unit2_bn1_moving_mean, float *stage3_unit2_bn1_moving_var,
    float *stage3_unit2_conv1_weight, float *stage3_unit2_bn2_gamma,
    float *stage3_unit2_bn2_beta, float *stage3_unit2_bn2_moving_mean,
    float *stage3_unit2_bn2_moving_var, float *stage3_unit2_conv2_weight,
    float *stage3_unit2_bn3_gamma, float *stage3_unit2_bn3_beta,
    float *stage3_unit2_bn3_moving_mean, float *stage3_unit2_bn3_moving_var,
    float *stage3_unit2_conv3_weight, float *stage3_unit3_bn1_gamma,
    float *stage3_unit3_bn1_beta, float *stage3_unit3_bn1_moving_mean,
    float *stage3_unit3_bn1_moving_var, float *stage3_unit3_conv1_weight,
    float *stage3_unit3_bn2_gamma, float *stage3_unit3_bn2_beta,
    float *stage3_unit3_bn2_moving_mean, float *stage3_unit3_bn2_moving_var,
    float *stage3_unit3_conv2_weight, float *stage3_unit3_bn3_gamma,
    float *stage3_unit3_bn3_beta, float *stage3_unit3_bn3_moving_mean,
    float *stage3_unit3_bn3_moving_var, float *stage3_unit3_conv3_weight,
    float *stage3_unit4_bn1_gamma, float *stage3_unit4_bn1_beta,
    float *stage3_unit4_bn1_moving_mean, float *stage3_unit4_bn1_moving_var,
    float *stage3_unit4_conv1_weight, float *stage3_unit4_bn2_gamma,
    float *stage3_unit4_bn2_beta, float *stage3_unit4_bn2_moving_mean,
    float *stage3_unit4_bn2_moving_var, float *stage3_unit4_conv2_weight,
    float *stage3_unit4_bn3_gamma, float *stage3_unit4_bn3_beta,
    float *stage3_unit4_bn3_moving_mean, float *stage3_unit4_bn3_moving_var,
    float *stage3_unit4_conv3_weight, float *stage3_unit5_bn1_gamma,
    float *stage3_unit5_bn1_beta, float *stage3_unit5_bn1_moving_mean,
    float *stage3_unit5_bn1_moving_var, float *stage3_unit5_conv1_weight,
    float *stage3_unit5_bn2_gamma, float *stage3_unit5_bn2_beta,
    float *stage3_unit5_bn2_moving_mean, float *stage3_unit5_bn2_moving_var,
    float *stage3_unit5_conv2_weight, float *stage3_unit5_bn3_gamma,
    float *stage3_unit5_bn3_beta, float *stage3_unit5_bn3_moving_mean,
    float *stage3_unit5_bn3_moving_var, float *stage3_unit5_conv3_weight,
    float *stage3_unit6_bn1_gamma, float *stage3_unit6_bn1_beta,
    float *stage3_unit6_bn1_moving_mean, float *stage3_unit6_bn1_moving_var,
    float *stage3_unit6_conv1_weight, float *stage3_unit6_bn2_gamma,
    float *stage3_unit6_bn2_beta, float *stage3_unit6_bn2_moving_mean,
    float *stage3_unit6_bn2_moving_var, float *stage3_unit6_conv2_weight,
    float *stage3_unit6_bn3_gamma, float *stage3_unit6_bn3_beta,
    float *stage3_unit6_bn3_moving_mean, float *stage3_unit6_bn3_moving_var,
    float *stage3_unit6_conv3_weight, float *stage4_unit1_bn1_gamma,
    float *stage4_unit1_bn1_beta, float *stage4_unit1_bn1_moving_mean,
    float *stage4_unit1_bn1_moving_var, float *stage4_unit1_conv1_weight,
    float *stage4_unit1_bn2_gamma, float *stage4_unit1_bn2_beta,
    float *stage4_unit1_bn2_moving_mean, float *stage4_unit1_bn2_moving_var,
    float *stage4_unit1_conv2_weight, float *stage4_unit1_bn3_gamma,
    float *stage4_unit1_bn3_beta, float *stage4_unit1_bn3_moving_mean,
    float *stage4_unit1_bn3_moving_var, float *stage4_unit1_conv3_weight,
    float *stage4_unit1_sc_weight, float *stage4_unit2_bn1_gamma,
    float *stage4_unit2_bn1_beta, float *stage4_unit2_bn1_moving_mean,
    float *stage4_unit2_bn1_moving_var, float *stage4_unit2_conv1_weight,
    float *stage4_unit2_bn2_gamma, float *stage4_unit2_bn2_beta,
    float *stage4_unit2_bn2_moving_mean, float *stage4_unit2_bn2_moving_var,
    float *stage4_unit2_conv2_weight, float *stage4_unit2_bn3_gamma,
    float *stage4_unit2_bn3_beta, float *stage4_unit2_bn3_moving_mean,
    float *stage4_unit2_bn3_moving_var, float *stage4_unit2_conv3_weight,
    float *stage4_unit3_bn1_gamma, float *stage4_unit3_bn1_beta,
    float *stage4_unit3_bn1_moving_mean, float *stage4_unit3_bn1_moving_var,
    float *stage4_unit3_conv1_weight, float *stage4_unit3_bn2_gamma,
    float *stage4_unit3_bn2_beta, float *stage4_unit3_bn2_moving_mean,
    float *stage4_unit3_bn2_moving_var, float *stage4_unit3_conv2_weight,
    float *stage4_unit3_bn3_gamma, float *stage4_unit3_bn3_beta,
    float *stage4_unit3_bn3_moving_mean, float *stage4_unit3_bn3_moving_var,
    float *stage4_unit3_conv3_weight, float *bn1_gamma, float *bn1_beta,
    float *bn1_moving_mean, float *bn1_moving_var, float *fc1_weight,
    float *fc1_bias) {
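
  /* Annotation: the body is one runtime call per Relay operator,
   * threaded through the static buffers declared above. Each
   * convolution is emitted as an explicit zero_pad_nhwc into a
   * `prepadding` buffer followed by the prepadded systolic-array
   * conv2d, so the h/w arguments passed to the conv are the padded
   * input sizes (e.g. 230 = 224 + 3 + 3 for the 7x7 stem convolution
   * below). */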

  batchNormInference((float *)data, (float *)b94157387959776, 1, 224, 224, 3,
                     bn_data_gamma, bn_data_beta, bn_data_moving_mean,
                     bn_data_moving_var, 0.00002);

  zero_pad_nhwc((float *)b94157388635216prepadding, (float *)b94157387959776,
                224, 224, 3, 3, 3, 3, 3);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388635216, (float *)b94157388635216prepadding,
      conv0_weight, 230, 230, 7, 7, 3, 64, 2, 2);

  batchNormInference((float *)b94157388635216, (float *)b94157387960288, 1, 112,
                     112, 64, bn0_gamma, bn0_beta, bn0_moving_mean,
                     bn0_moving_var, 0.00002);

  relu((float *)b94157387960288, (float *)b94157387960480, 1, 112, 112, 64);

  maxpool2D3x3_resnet18_op6((float *)b94157387960480, (float *)b94157387960768);

  batchNormInference((float *)b94157387960768, (float *)b94157387961328, 1, 56,
                     56, 64, stage1_unit1_bn1_gamma, stage1_unit1_bn1_beta,
                     stage1_unit1_bn1_moving_mean, stage1_unit1_bn1_moving_var,
                     0.00002);

  relu((float *)b94157387961328, (float *)b94157387961520, 1, 56, 56, 64);

  zero_pad_nhwc((float *)b94157387961808prepadding, (float *)b94157387961520,
                56, 56, 64, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157387961808, (float *)b94157387961808prepadding,
      stage1_unit1_conv1_weight, 56, 56, 1, 1, 64, 64, 1, 1);

  batchNormInference((float *)b94157387961808, (float *)b94157387962368, 1, 56,
                     56, 64, stage1_unit1_bn2_gamma, stage1_unit1_bn2_beta,
                     stage1_unit1_bn2_moving_mean, stage1_unit1_bn2_moving_var,
                     0.00002);

  relu((float *)b94157387962368, (float *)b94157387962560, 1, 56, 56, 64);

  zero_pad_nhwc((float *)b94157387962848prepadding, (float *)b94157387962560,
                56, 56, 64, 1, 1, 1, 1);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157387962848, (float *)b94157387962848prepadding,
      stage1_unit1_conv2_weight, 58, 58, 3, 3, 64, 64, 1, 1);

  batchNormInference((float *)b94157387962848, (float *)b94157387963408, 1, 56,
                     56, 64, stage1_unit1_bn3_gamma, stage1_unit1_bn3_beta,
                     stage1_unit1_bn3_moving_mean, stage1_unit1_bn3_moving_var,
                     0.00002);

  relu((float *)b94157387963408, (float *)b94157387963600, 1, 56, 56, 64);

  zero_pad_nhwc((float *)b94157387963888prepadding, (float *)b94157387963600,
                56, 56, 64, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157387963888, (float *)b94157387963888prepadding,
      stage1_unit1_conv3_weight, 56, 56, 1, 1, 64, 256, 1, 1);

  zero_pad_nhwc((float *)b94157387964176prepadding, (float *)b94157387961520,
                56, 56, 64, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157387964176, (float *)b94157387964176prepadding,
      stage1_unit1_sc_weight, 56, 56, 1, 1, 64, 256, 1, 1);

  int b94157387964464_out_shape[4] = {1, 56, 56, 256};
  int b94157387964464_a_shape[4] = {1, 56, 56, 256};
  int b94157387964464_b_shape[4] = {1, 56, 56, 256};
  add_with_broadcasting(
      (float *)b94157387964464, (float *)b94157387963888,
      (float *)b94157387964176, (int *)b94157387964464_out_shape, 4,
      (int *)b94157387964464_a_shape, 4, (int *)b94157387964464_b_shape, 4);
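
  /* Annotation: the add_with_broadcasting call above is the block's
   * residual connection, summing the conv3 output with the 1x1
   * shortcut convolution of the block input. The shape/ndims arguments
   * permit general broadcasting, though here both operands always
   * share the output shape. This pattern repeats for every bottleneck
   * unit below. */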

  batchNormInference((float *)b94157387964464, (float *)b94157387965024, 1, 56,
                     56, 256, stage1_unit2_bn1_gamma, stage1_unit2_bn1_beta,
                     stage1_unit2_bn1_moving_mean, stage1_unit2_bn1_moving_var,
                     0.00002);

  relu((float *)b94157387965024, (float *)b94157387965216, 1, 56, 56, 256);

  zero_pad_nhwc((float *)b94157387965504prepadding, (float *)b94157387965216,
                56, 56, 256, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157387965504, (float *)b94157387965504prepadding,
      stage1_unit2_conv1_weight, 56, 56, 1, 1, 256, 64, 1, 1);

  batchNormInference((float *)b94157387965504, (float *)b94157387966064, 1, 56,
                     56, 64, stage1_unit2_bn2_gamma, stage1_unit2_bn2_beta,
                     stage1_unit2_bn2_moving_mean, stage1_unit2_bn2_moving_var,
                     0.00002);

  relu((float *)b94157387966064, (float *)b94157387966256, 1, 56, 56, 64);

  zero_pad_nhwc((float *)b94157387966544prepadding, (float *)b94157387966256,
                56, 56, 64, 1, 1, 1, 1);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157387966544, (float *)b94157387966544prepadding,
      stage1_unit2_conv2_weight, 58, 58, 3, 3, 64, 64, 1, 1);

  batchNormInference((float *)b94157387966544, (float *)b94157387967104, 1, 56,
                     56, 64, stage1_unit2_bn3_gamma, stage1_unit2_bn3_beta,
                     stage1_unit2_bn3_moving_mean, stage1_unit2_bn3_moving_var,
                     0.00002);

  relu((float *)b94157387967104, (float *)b94157387967296, 1, 56, 56, 64);

  zero_pad_nhwc((float *)b94157387967584prepadding, (float *)b94157387967296,
                56, 56, 64, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157387967584, (float *)b94157387967584prepadding,
      stage1_unit2_conv3_weight, 56, 56, 1, 1, 64, 256, 1, 1);

  int b94157387967872_out_shape[4] = {1, 56, 56, 256};
  int b94157387967872_a_shape[4] = {1, 56, 56, 256};
  int b94157387967872_b_shape[4] = {1, 56, 56, 256};
  add_with_broadcasting(
      (float *)b94157387967872, (float *)b94157387967584,
      (float *)b94157387964464, (int *)b94157387967872_out_shape, 4,
      (int *)b94157387967872_a_shape, 4, (int *)b94157387967872_b_shape, 4);

  batchNormInference((float *)b94157387967872, (float *)b94157387968432, 1, 56,
                     56, 256, stage1_unit3_bn1_gamma, stage1_unit3_bn1_beta,
                     stage1_unit3_bn1_moving_mean, stage1_unit3_bn1_moving_var,
                     0.00002);

  relu((float *)b94157387968432, (float *)b94157387968624, 1, 56, 56, 256);

  zero_pad_nhwc((float *)b94157387968912prepadding, (float *)b94157387968624,
                56, 56, 256, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157387968912, (float *)b94157387968912prepadding,
      stage1_unit3_conv1_weight, 56, 56, 1, 1, 256, 64, 1, 1);

  batchNormInference((float *)b94157387968912, (float *)b94157387969472, 1, 56,
                     56, 64, stage1_unit3_bn2_gamma, stage1_unit3_bn2_beta,
                     stage1_unit3_bn2_moving_mean, stage1_unit3_bn2_moving_var,
                     0.00002);

  relu((float *)b94157387969472, (float *)b94157387969664, 1, 56, 56, 64);

  zero_pad_nhwc((float *)b94157387969952prepadding, (float *)b94157387969664,
                56, 56, 64, 1, 1, 1, 1);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157387969952, (float *)b94157387969952prepadding,
      stage1_unit3_conv2_weight, 58, 58, 3, 3, 64, 64, 1, 1);

  batchNormInference((float *)b94157387969952, (float *)b94157387970512, 1, 56,
                     56, 64, stage1_unit3_bn3_gamma, stage1_unit3_bn3_beta,
                     stage1_unit3_bn3_moving_mean, stage1_unit3_bn3_moving_var,
                     0.00002);

  relu((float *)b94157387970512, (float *)b94157387970704, 1, 56, 56, 64);

  zero_pad_nhwc((float *)b94157387970992prepadding, (float *)b94157387970704,
                56, 56, 64, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157387970992, (float *)b94157387970992prepadding,
      stage1_unit3_conv3_weight, 56, 56, 1, 1, 64, 256, 1, 1);

  int b94157387971280_out_shape[4] = {1, 56, 56, 256};
  int b94157387971280_a_shape[4] = {1, 56, 56, 256};
  int b94157387971280_b_shape[4] = {1, 56, 56, 256};
  add_with_broadcasting(
      (float *)b94157387971280, (float *)b94157387970992,
      (float *)b94157387967872, (int *)b94157387971280_out_shape, 4,
      (int *)b94157387971280_a_shape, 4, (int *)b94157387971280_b_shape, 4);

  batchNormInference((float *)b94157387971280, (float *)b94157387971840, 1, 56,
                     56, 256, stage2_unit1_bn1_gamma, stage2_unit1_bn1_beta,
                     stage2_unit1_bn1_moving_mean, stage2_unit1_bn1_moving_var,
                     0.00002);

  relu((float *)b94157387971840, (float *)b94157387972032, 1, 56, 56, 256);

  zero_pad_nhwc((float *)b94157387972320prepadding, (float *)b94157387972032,
                56, 56, 256, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157387972320, (float *)b94157387972320prepadding,
      stage2_unit1_conv1_weight, 56, 56, 1, 1, 256, 128, 2, 2);

  batchNormInference((float *)b94157387972320, (float *)b94157387972880, 1, 28,
                     28, 128, stage2_unit1_bn2_gamma, stage2_unit1_bn2_beta,
                     stage2_unit1_bn2_moving_mean, stage2_unit1_bn2_moving_var,
                     0.00002);

  relu((float *)b94157387972880, (float *)b94157387973072, 1, 28, 28, 128);

  zero_pad_nhwc((float *)b94157387973360prepadding, (float *)b94157387973072,
                28, 28, 128, 1, 1, 1, 1);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157387973360, (float *)b94157387973360prepadding,
      stage2_unit1_conv2_weight, 30, 30, 3, 3, 128, 128, 1, 1);

  batchNormInference((float *)b94157387973360, (float *)b94157387973920, 1, 28,
                     28, 128, stage2_unit1_bn3_gamma, stage2_unit1_bn3_beta,
                     stage2_unit1_bn3_moving_mean, stage2_unit1_bn3_moving_var,
                     0.00002);

  relu((float *)b94157387973920, (float *)b94157387974112, 1, 28, 28, 128);

  zero_pad_nhwc((float *)b94157387974400prepadding, (float *)b94157387974112,
                28, 28, 128, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157387974400, (float *)b94157387974400prepadding,
      stage2_unit1_conv3_weight, 28, 28, 1, 1, 128, 512, 1, 1);

  zero_pad_nhwc((float *)b94157387974688prepadding, (float *)b94157387972032,
                56, 56, 256, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157387974688, (float *)b94157387974688prepadding,
      stage2_unit1_sc_weight, 56, 56, 1, 1, 256, 512, 2, 2);
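
  /* Annotation: first unit of a new stage. The strided 1x1 shortcut
   * convolution above halves the spatial dimensions (56x56 -> 28x28)
   * and widens the channels (256 -> 512), so the shortcut matches the
   * main path for the residual add that follows. */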

  int b94157387974976_out_shape[4] = {1, 28, 28, 512};
  int b94157387974976_a_shape[4] = {1, 28, 28, 512};
  int b94157387974976_b_shape[4] = {1, 28, 28, 512};
  add_with_broadcasting(
      (float *)b94157387974976, (float *)b94157387974400,
      (float *)b94157387974688, (int *)b94157387974976_out_shape, 4,
      (int *)b94157387974976_a_shape, 4, (int *)b94157387974976_b_shape, 4);

  batchNormInference((float *)b94157387974976, (float *)b94157388802288, 1, 28,
                     28, 512, stage2_unit2_bn1_gamma, stage2_unit2_bn1_beta,
                     stage2_unit2_bn1_moving_mean, stage2_unit2_bn1_moving_var,
                     0.00002);

  relu((float *)b94157388802288, (float *)b94157388802480, 1, 28, 28, 512);

  zero_pad_nhwc((float *)b94157388802768prepadding, (float *)b94157388802480,
                28, 28, 512, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388802768, (float *)b94157388802768prepadding,
      stage2_unit2_conv1_weight, 28, 28, 1, 1, 512, 128, 1, 1);

  batchNormInference((float *)b94157388802768, (float *)b94157388803328, 1, 28,
                     28, 128, stage2_unit2_bn2_gamma, stage2_unit2_bn2_beta,
                     stage2_unit2_bn2_moving_mean, stage2_unit2_bn2_moving_var,
                     0.00002);

  relu((float *)b94157388803328, (float *)b94157387941008, 1, 28, 28, 128);

  zero_pad_nhwc((float *)b94157387941296prepadding, (float *)b94157387941008,
                28, 28, 128, 1, 1, 1, 1);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157387941296, (float *)b94157387941296prepadding,
      stage2_unit2_conv2_weight, 30, 30, 3, 3, 128, 128, 1, 1);

  batchNormInference((float *)b94157387941296, (float *)b94157387941856, 1, 28,
                     28, 128, stage2_unit2_bn3_gamma, stage2_unit2_bn3_beta,
                     stage2_unit2_bn3_moving_mean, stage2_unit2_bn3_moving_var,
                     0.00002);

  relu((float *)b94157387941856, (float *)b94157387942048, 1, 28, 28, 128);

  zero_pad_nhwc((float *)b94157387942336prepadding, (float *)b94157387942048,
                28, 28, 128, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157387942336, (float *)b94157387942336prepadding,
      stage2_unit2_conv3_weight, 28, 28, 1, 1, 128, 512, 1, 1);

  int b94157387942624_out_shape[4] = {1, 28, 28, 512};
  int b94157387942624_a_shape[4] = {1, 28, 28, 512};
  int b94157387942624_b_shape[4] = {1, 28, 28, 512};
  add_with_broadcasting(
      (float *)b94157387942624, (float *)b94157387942336,
      (float *)b94157387974976, (int *)b94157387942624_out_shape, 4,
      (int *)b94157387942624_a_shape, 4, (int *)b94157387942624_b_shape, 4);

  batchNormInference((float *)b94157387942624, (float *)b94157387943184, 1, 28,
                     28, 512, stage2_unit3_bn1_gamma, stage2_unit3_bn1_beta,
                     stage2_unit3_bn1_moving_mean, stage2_unit3_bn1_moving_var,
                     0.00002);

  relu((float *)b94157387943184, (float *)b94157387943376, 1, 28, 28, 512);

  zero_pad_nhwc((float *)b94157388809200prepadding, (float *)b94157387943376,
                28, 28, 512, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388809200, (float *)b94157388809200prepadding,
      stage2_unit3_conv1_weight, 28, 28, 1, 1, 512, 128, 1, 1);

  batchNormInference((float *)b94157388809200, (float *)b94157388809712, 1, 28,
                     28, 128, stage2_unit3_bn2_gamma, stage2_unit3_bn2_beta,
                     stage2_unit3_bn2_moving_mean, stage2_unit3_bn2_moving_var,
                     0.00002);

  relu((float *)b94157388809712, (float *)b94157388809904, 1, 28, 28, 128);

  zero_pad_nhwc((float *)b94157388810192prepadding, (float *)b94157388809904,
                28, 28, 128, 1, 1, 1, 1);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388810192, (float *)b94157388810192prepadding,
      stage2_unit3_conv2_weight, 30, 30, 3, 3, 128, 128, 1, 1);

  batchNormInference((float *)b94157388810192, (float *)b94157388810752, 1, 28,
                     28, 128, stage2_unit3_bn3_gamma, stage2_unit3_bn3_beta,
                     stage2_unit3_bn3_moving_mean, stage2_unit3_bn3_moving_var,
                     0.00002);

  relu((float *)b94157388810752, (float *)b94157388810944, 1, 28, 28, 128);

  zero_pad_nhwc((float *)b94157388811232prepadding, (float *)b94157388810944,
                28, 28, 128, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388811232, (float *)b94157388811232prepadding,
      stage2_unit3_conv3_weight, 28, 28, 1, 1, 128, 512, 1, 1);

  int b94157388811520_out_shape[4] = {1, 28, 28, 512};
  int b94157388811520_a_shape[4] = {1, 28, 28, 512};
  int b94157388811520_b_shape[4] = {1, 28, 28, 512};
  add_with_broadcasting(
      (float *)b94157388811520, (float *)b94157388811232,
      (float *)b94157387942624, (int *)b94157388811520_out_shape, 4,
      (int *)b94157388811520_a_shape, 4, (int *)b94157388811520_b_shape, 4);

  batchNormInference((float *)b94157388811520, (float *)b94157388812080, 1, 28,
                     28, 512, stage2_unit4_bn1_gamma, stage2_unit4_bn1_beta,
                     stage2_unit4_bn1_moving_mean, stage2_unit4_bn1_moving_var,
                     0.00002);

  relu((float *)b94157388812080, (float *)b94157388812272, 1, 28, 28, 512);

  zero_pad_nhwc((float *)b94157388812560prepadding, (float *)b94157388812272,
                28, 28, 512, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388812560, (float *)b94157388812560prepadding,
      stage2_unit4_conv1_weight, 28, 28, 1, 1, 512, 128, 1, 1);

  batchNormInference((float *)b94157388812560, (float *)b94157388813120, 1, 28,
                     28, 128, stage2_unit4_bn2_gamma, stage2_unit4_bn2_beta,
                     stage2_unit4_bn2_moving_mean, stage2_unit4_bn2_moving_var,
                     0.00002);

  relu((float *)b94157388813120, (float *)b94157388813312, 1, 28, 28, 128);

  zero_pad_nhwc((float *)b94157388813600prepadding, (float *)b94157388813312,
                28, 28, 128, 1, 1, 1, 1);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388813600, (float *)b94157388813600prepadding,
      stage2_unit4_conv2_weight, 30, 30, 3, 3, 128, 128, 1, 1);

  batchNormInference((float *)b94157388813600, (float *)b94157388814160, 1, 28,
                     28, 128, stage2_unit4_bn3_gamma, stage2_unit4_bn3_beta,
                     stage2_unit4_bn3_moving_mean, stage2_unit4_bn3_moving_var,
                     0.00002);

  relu((float *)b94157388814160, (float *)b94157388814352, 1, 28, 28, 128);

  zero_pad_nhwc((float *)b94157388814640prepadding, (float *)b94157388814352,
                28, 28, 128, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388814640, (float *)b94157388814640prepadding,
      stage2_unit4_conv3_weight, 28, 28, 1, 1, 128, 512, 1, 1);

  int b94157388814928_out_shape[4] = {1, 28, 28, 512};
  int b94157388814928_a_shape[4] = {1, 28, 28, 512};
  int b94157388814928_b_shape[4] = {1, 28, 28, 512};
  add_with_broadcasting(
      (float *)b94157388814928, (float *)b94157388814640,
      (float *)b94157388811520, (int *)b94157388814928_out_shape, 4,
      (int *)b94157388814928_a_shape, 4, (int *)b94157388814928_b_shape, 4);

  batchNormInference((float *)b94157388814928, (float *)b94157388815488, 1, 28,
                     28, 512, stage3_unit1_bn1_gamma, stage3_unit1_bn1_beta,
                     stage3_unit1_bn1_moving_mean, stage3_unit1_bn1_moving_var,
                     0.00002);

  relu((float *)b94157388815488, (float *)b94157388815680, 1, 28, 28, 512);

  zero_pad_nhwc((float *)b94157388815968prepadding, (float *)b94157388815680,
                28, 28, 512, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388815968, (float *)b94157388815968prepadding,
      stage3_unit1_conv1_weight, 28, 28, 1, 1, 512, 256, 2, 2);

  batchNormInference((float *)b94157388815968, (float *)b94157388816528, 1, 14,
                     14, 256, stage3_unit1_bn2_gamma, stage3_unit1_bn2_beta,
                     stage3_unit1_bn2_moving_mean, stage3_unit1_bn2_moving_var,
                     0.00002);

  relu((float *)b94157388816528, (float *)b94157388816720, 1, 14, 14, 256);

  zero_pad_nhwc((float *)b94157388817008prepadding, (float *)b94157388816720,
                14, 14, 256, 1, 1, 1, 1);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388817008, (float *)b94157388817008prepadding,
      stage3_unit1_conv2_weight, 16, 16, 3, 3, 256, 256, 1, 1);

  batchNormInference((float *)b94157388817008, (float *)b94157388817568, 1, 14,
                     14, 256, stage3_unit1_bn3_gamma, stage3_unit1_bn3_beta,
                     stage3_unit1_bn3_moving_mean, stage3_unit1_bn3_moving_var,
                     0.00002);

  relu((float *)b94157388817568, (float *)b94157388817760, 1, 14, 14, 256);

  zero_pad_nhwc((float *)b94157388818048prepadding, (float *)b94157388817760,
                14, 14, 256, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388818048, (float *)b94157388818048prepadding,
      stage3_unit1_conv3_weight, 14, 14, 1, 1, 256, 1024, 1, 1);

  zero_pad_nhwc((float *)b94157388818336prepadding, (float *)b94157388815680,
                28, 28, 512, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388818336, (float *)b94157388818336prepadding,
      stage3_unit1_sc_weight, 28, 28, 1, 1, 512, 1024, 2, 2);

  int b94157388818624_out_shape[4] = {1, 14, 14, 1024};
  int b94157388818624_a_shape[4] = {1, 14, 14, 1024};
  int b94157388818624_b_shape[4] = {1, 14, 14, 1024};
  add_with_broadcasting(
      (float *)b94157388818624, (float *)b94157388818048,
      (float *)b94157388818336, (int *)b94157388818624_out_shape, 4,
      (int *)b94157388818624_a_shape, 4, (int *)b94157388818624_b_shape, 4);

  batchNormInference((float *)b94157388818624, (float *)b94157388819184, 1, 14,
                     14, 1024, stage3_unit2_bn1_gamma, stage3_unit2_bn1_beta,
                     stage3_unit2_bn1_moving_mean, stage3_unit2_bn1_moving_var,
                     0.00002);

  relu((float *)b94157388819184, (float *)b94157388819376, 1, 14, 14, 1024);

  zero_pad_nhwc((float *)b94157388819664prepadding, (float *)b94157388819376,
                14, 14, 1024, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388819664, (float *)b94157388819664prepadding,
      stage3_unit2_conv1_weight, 14, 14, 1, 1, 1024, 256, 1, 1);

  batchNormInference((float *)b94157388819664, (float *)b94157388820224, 1, 14,
                     14, 256, stage3_unit2_bn2_gamma, stage3_unit2_bn2_beta,
                     stage3_unit2_bn2_moving_mean, stage3_unit2_bn2_moving_var,
                     0.00002);

  relu((float *)b94157388820224, (float *)b94157388820416, 1, 14, 14, 256);

  zero_pad_nhwc((float *)b94157388820704prepadding, (float *)b94157388820416,
                14, 14, 256, 1, 1, 1, 1);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388820704, (float *)b94157388820704prepadding,
      stage3_unit2_conv2_weight, 16, 16, 3, 3, 256, 256, 1, 1);

  batchNormInference((float *)b94157388820704, (float *)b94157388821264, 1, 14,
                     14, 256, stage3_unit2_bn3_gamma, stage3_unit2_bn3_beta,
                     stage3_unit2_bn3_moving_mean, stage3_unit2_bn3_moving_var,
                     0.00002);

  relu((float *)b94157388821264, (float *)b94157388821456, 1, 14, 14, 256);

  zero_pad_nhwc((float *)b94157388821744prepadding, (float *)b94157388821456,
                14, 14, 256, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388821744, (float *)b94157388821744prepadding,
      stage3_unit2_conv3_weight, 14, 14, 1, 1, 256, 1024, 1, 1);

  int b94157388822032_out_shape[4] = {1, 14, 14, 1024};
  int b94157388822032_a_shape[4] = {1, 14, 14, 1024};
  int b94157388822032_b_shape[4] = {1, 14, 14, 1024};
  add_with_broadcasting(
      (float *)b94157388822032, (float *)b94157388821744,
      (float *)b94157388818624, (int *)b94157388822032_out_shape, 4,
      (int *)b94157388822032_a_shape, 4, (int *)b94157388822032_b_shape, 4);

  batchNormInference((float *)b94157388822032, (float *)b94157388822592, 1, 14,
                     14, 1024, stage3_unit3_bn1_gamma, stage3_unit3_bn1_beta,
                     stage3_unit3_bn1_moving_mean, stage3_unit3_bn1_moving_var,
                     0.00002);

  relu((float *)b94157388822592, (float *)b94157388822784, 1, 14, 14, 1024);

  zero_pad_nhwc((float *)b94157388823072prepadding, (float *)b94157388822784,
                14, 14, 1024, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388823072, (float *)b94157388823072prepadding,
      stage3_unit3_conv1_weight, 14, 14, 1, 1, 1024, 256, 1, 1);

  batchNormInference((float *)b94157388823072, (float *)b94157388823632, 1, 14,
                     14, 256, stage3_unit3_bn2_gamma, stage3_unit3_bn2_beta,
                     stage3_unit3_bn2_moving_mean, stage3_unit3_bn2_moving_var,
                     0.00002);

  relu((float *)b94157388823632, (float *)b94157388823824, 1, 14, 14, 256);

  zero_pad_nhwc((float *)b94157388824112prepadding, (float *)b94157388823824,
                14, 14, 256, 1, 1, 1, 1);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388824112, (float *)b94157388824112prepadding,
      stage3_unit3_conv2_weight, 16, 16, 3, 3, 256, 256, 1, 1);

  batchNormInference((float *)b94157388824112, (float *)b94157388824672, 1, 14,
                     14, 256, stage3_unit3_bn3_gamma, stage3_unit3_bn3_beta,
                     stage3_unit3_bn3_moving_mean, stage3_unit3_bn3_moving_var,
                     0.00002);

  relu((float *)b94157388824672, (float *)b94157388824864, 1, 14, 14, 256);

  zero_pad_nhwc((float *)b94157388825152prepadding, (float *)b94157388824864,
                14, 14, 256, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388825152, (float *)b94157388825152prepadding,
      stage3_unit3_conv3_weight, 14, 14, 1, 1, 256, 1024, 1, 1);

  int b94157388825440_out_shape[4] = {1, 14, 14, 1024};
  int b94157388825440_a_shape[4] = {1, 14, 14, 1024};
  int b94157388825440_b_shape[4] = {1, 14, 14, 1024};
  add_with_broadcasting(
      (float *)b94157388825440, (float *)b94157388825152,
      (float *)b94157388822032, (int *)b94157388825440_out_shape, 4,
      (int *)b94157388825440_a_shape, 4, (int *)b94157388825440_b_shape, 4);

  batchNormInference((float *)b94157388825440, (float *)b94157388826000, 1, 14,
                     14, 1024, stage3_unit4_bn1_gamma, stage3_unit4_bn1_beta,
                     stage3_unit4_bn1_moving_mean, stage3_unit4_bn1_moving_var,
                     0.00002);

  relu((float *)b94157388826000, (float *)b94157388826192, 1, 14, 14, 1024);

  zero_pad_nhwc((float *)b94157388826480prepadding, (float *)b94157388826192,
                14, 14, 1024, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388826480, (float *)b94157388826480prepadding,
      stage3_unit4_conv1_weight, 14, 14, 1, 1, 1024, 256, 1, 1);

  batchNormInference((float *)b94157388826480, (float *)b94157388827040, 1, 14,
                     14, 256, stage3_unit4_bn2_gamma, stage3_unit4_bn2_beta,
                     stage3_unit4_bn2_moving_mean, stage3_unit4_bn2_moving_var,
                     0.00002);

  relu((float *)b94157388827040, (float *)b94157388827232, 1, 14, 14, 256);

  zero_pad_nhwc((float *)b94157388827520prepadding, (float *)b94157388827232,
                14, 14, 256, 1, 1, 1, 1);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388827520, (float *)b94157388827520prepadding,
      stage3_unit4_conv2_weight, 16, 16, 3, 3, 256, 256, 1, 1);

  batchNormInference((float *)b94157388827520, (float *)b94157388828080, 1, 14,
                     14, 256, stage3_unit4_bn3_gamma, stage3_unit4_bn3_beta,
                     stage3_unit4_bn3_moving_mean, stage3_unit4_bn3_moving_var,
                     0.00002);

  relu((float *)b94157388828080, (float *)b94157388828272, 1, 14, 14, 256);

  zero_pad_nhwc((float *)b94157388828560prepadding, (float *)b94157388828272,
                14, 14, 256, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388828560, (float *)b94157388828560prepadding,
      stage3_unit4_conv3_weight, 14, 14, 1, 1, 256, 1024, 1, 1);

  int b94157388828848_out_shape[4] = {1, 14, 14, 1024};
  int b94157388828848_a_shape[4] = {1, 14, 14, 1024};
  int b94157388828848_b_shape[4] = {1, 14, 14, 1024};
  add_with_broadcasting(
      (float *)b94157388828848, (float *)b94157388828560,
      (float *)b94157388825440, (int *)b94157388828848_out_shape, 4,
      (int *)b94157388828848_a_shape, 4, (int *)b94157388828848_b_shape, 4);

  batchNormInference((float *)b94157388828848, (float *)b94157388829408, 1, 14,
                     14, 1024, stage3_unit5_bn1_gamma, stage3_unit5_bn1_beta,
                     stage3_unit5_bn1_moving_mean, stage3_unit5_bn1_moving_var,
                     0.00002);

  relu((float *)b94157388829408, (float *)b94157388829600, 1, 14, 14, 1024);

  zero_pad_nhwc((float *)b94157388829888prepadding, (float *)b94157388829600,
                14, 14, 1024, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388829888, (float *)b94157388829888prepadding,
      stage3_unit5_conv1_weight, 14, 14, 1, 1, 1024, 256, 1, 1);

  batchNormInference((float *)b94157388829888, (float *)b94157388830448, 1, 14,
                     14, 256, stage3_unit5_bn2_gamma, stage3_unit5_bn2_beta,
                     stage3_unit5_bn2_moving_mean, stage3_unit5_bn2_moving_var,
                     0.00002);

  relu((float *)b94157388830448, (float *)b94157388830640, 1, 14, 14, 256);

  zero_pad_nhwc((float *)b94157388830928prepadding, (float *)b94157388830640,
                14, 14, 256, 1, 1, 1, 1);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388830928, (float *)b94157388830928prepadding,
      stage3_unit5_conv2_weight, 16, 16, 3, 3, 256, 256, 1, 1);

  batchNormInference((float *)b94157388830928, (float *)b94157388831488, 1, 14,
                     14, 256, stage3_unit5_bn3_gamma, stage3_unit5_bn3_beta,
                     stage3_unit5_bn3_moving_mean, stage3_unit5_bn3_moving_var,
                     0.00002);

  relu((float *)b94157388831488, (float *)b94157388831680, 1, 14, 14, 256);

  zero_pad_nhwc((float *)b94157388831968prepadding, (float *)b94157388831680,
                14, 14, 256, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388831968, (float *)b94157388831968prepadding,
      stage3_unit5_conv3_weight, 14, 14, 1, 1, 256, 1024, 1, 1);

  int b94157388832256_out_shape[4] = {1, 14, 14, 1024};
  int b94157388832256_a_shape[4] = {1, 14, 14, 1024};
  int b94157388832256_b_shape[4] = {1, 14, 14, 1024};
  add_with_broadcasting(
      (float *)b94157388832256, (float *)b94157388831968,
      (float *)b94157388828848, (int *)b94157388832256_out_shape, 4,
      (int *)b94157388832256_a_shape, 4, (int *)b94157388832256_b_shape, 4);

  batchNormInference((float *)b94157388832256, (float *)b94157388832816, 1, 14,
                     14, 1024, stage3_unit6_bn1_gamma, stage3_unit6_bn1_beta,
                     stage3_unit6_bn1_moving_mean, stage3_unit6_bn1_moving_var,
                     0.00002);

  relu((float *)b94157388832816, (float *)b94157388833008, 1, 14, 14, 1024);

  zero_pad_nhwc((float *)b94157388833296prepadding, (float *)b94157388833008,
                14, 14, 1024, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388833296, (float *)b94157388833296prepadding,
      stage3_unit6_conv1_weight, 14, 14, 1, 1, 1024, 256, 1, 1);

  batchNormInference((float *)b94157388833296, (float *)b94157388833856, 1, 14,
                     14, 256, stage3_unit6_bn2_gamma, stage3_unit6_bn2_beta,
                     stage3_unit6_bn2_moving_mean, stage3_unit6_bn2_moving_var,
                     0.00002);

  relu((float *)b94157388833856, (float *)b94157388834048, 1, 14, 14, 256);

  zero_pad_nhwc((float *)b94157388834336prepadding, (float *)b94157388834048,
                14, 14, 256, 1, 1, 1, 1);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388834336, (float *)b94157388834336prepadding,
      stage3_unit6_conv2_weight, 16, 16, 3, 3, 256, 256, 1, 1);

  batchNormInference((float *)b94157388834336, (float *)b94157388834896, 1, 14,
                     14, 256, stage3_unit6_bn3_gamma, stage3_unit6_bn3_beta,
                     stage3_unit6_bn3_moving_mean, stage3_unit6_bn3_moving_var,
                     0.00002);

  relu((float *)b94157388834896, (float *)b94157388835088, 1, 14, 14, 256);

  zero_pad_nhwc((float *)b94157388835376prepadding, (float *)b94157388835088,
                14, 14, 256, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388835376, (float *)b94157388835376prepadding,
      stage3_unit6_conv3_weight, 14, 14, 1, 1, 256, 1024, 1, 1);

  int b94157388835664_out_shape[4] = {1, 14, 14, 1024};
  int b94157388835664_a_shape[4] = {1, 14, 14, 1024};
  int b94157388835664_b_shape[4] = {1, 14, 14, 1024};
  add_with_broadcasting(
      (float *)b94157388835664, (float *)b94157388835376,
      (float *)b94157388832256, (int *)b94157388835664_out_shape, 4,
      (int *)b94157388835664_a_shape, 4, (int *)b94157388835664_b_shape, 4);

  batchNormInference((float *)b94157388835664, (float *)b94157388836224, 1, 14,
                     14, 1024, stage4_unit1_bn1_gamma, stage4_unit1_bn1_beta,
                     stage4_unit1_bn1_moving_mean, stage4_unit1_bn1_moving_var,
                     0.00002);

  relu((float *)b94157388836224, (float *)b94157388836416, 1, 14, 14, 1024);

  zero_pad_nhwc((float *)b94157388836704prepadding, (float *)b94157388836416,
                14, 14, 1024, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388836704, (float *)b94157388836704prepadding,
      stage4_unit1_conv1_weight, 14, 14, 1, 1, 1024, 512, 2, 2);

  batchNormInference((float *)b94157388836704, (float *)b94157388837264, 1, 7,
                     7, 512, stage4_unit1_bn2_gamma, stage4_unit1_bn2_beta,
                     stage4_unit1_bn2_moving_mean, stage4_unit1_bn2_moving_var,
                     0.00002);

  relu((float *)b94157388837264, (float *)b94157388837456, 1, 7, 7, 512);

  zero_pad_nhwc((float *)b94157388837744prepadding, (float *)b94157388837456, 7,
                7, 512, 1, 1, 1, 1);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388837744, (float *)b94157388837744prepadding,
      stage4_unit1_conv2_weight, 9, 9, 3, 3, 512, 512, 1, 1);

  batchNormInference((float *)b94157388837744, (float *)b94157388838304, 1, 7,
                     7, 512, stage4_unit1_bn3_gamma, stage4_unit1_bn3_beta,
                     stage4_unit1_bn3_moving_mean, stage4_unit1_bn3_moving_var,
                     0.00002);

  relu((float *)b94157388838304, (float *)b94157388838496, 1, 7, 7, 512);

  zero_pad_nhwc((float *)b94157388838784prepadding, (float *)b94157388838496, 7,
                7, 512, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388838784, (float *)b94157388838784prepadding,
      stage4_unit1_conv3_weight, 7, 7, 1, 1, 512, 2048, 1, 1);

  zero_pad_nhwc((float *)b94157388839072prepadding, (float *)b94157388836416,
                14, 14, 1024, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388839072, (float *)b94157388839072prepadding,
      stage4_unit1_sc_weight, 14, 14, 1, 1, 1024, 2048, 2, 2);

  int b94157388839360_out_shape[4] = {1, 7, 7, 2048};
  int b94157388839360_a_shape[4] = {1, 7, 7, 2048};
  int b94157388839360_b_shape[4] = {1, 7, 7, 2048};
  add_with_broadcasting(
      (float *)b94157388839360, (float *)b94157388838784,
      (float *)b94157388839072, (int *)b94157388839360_out_shape, 4,
      (int *)b94157388839360_a_shape, 4, (int *)b94157388839360_b_shape, 4);

  batchNormInference((float *)b94157388839360, (float *)b94157388839920, 1, 7,
                     7, 2048, stage4_unit2_bn1_gamma, stage4_unit2_bn1_beta,
                     stage4_unit2_bn1_moving_mean, stage4_unit2_bn1_moving_var,
                     0.00002);

  relu((float *)b94157388839920, (float *)b94157388840112, 1, 7, 7, 2048);

  zero_pad_nhwc((float *)b94157388840400prepadding, (float *)b94157388840112, 7,
                7, 2048, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388840400, (float *)b94157388840400prepadding,
      stage4_unit2_conv1_weight, 7, 7, 1, 1, 2048, 512, 1, 1);

  batchNormInference((float *)b94157388840400, (float *)b94157388840960, 1, 7,
                     7, 512, stage4_unit2_bn2_gamma, stage4_unit2_bn2_beta,
                     stage4_unit2_bn2_moving_mean, stage4_unit2_bn2_moving_var,
                     0.00002);

  relu((float *)b94157388840960, (float *)b94157388841152, 1, 7, 7, 512);

  zero_pad_nhwc((float *)b94157388841440prepadding, (float *)b94157388841152, 7,
                7, 512, 1, 1, 1, 1);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388841440, (float *)b94157388841440prepadding,
      stage4_unit2_conv2_weight, 9, 9, 3, 3, 512, 512, 1, 1);

  batchNormInference((float *)b94157388841440, (float *)b94157388842000, 1, 7,
                     7, 512, stage4_unit2_bn3_gamma, stage4_unit2_bn3_beta,
                     stage4_unit2_bn3_moving_mean, stage4_unit2_bn3_moving_var,
                     0.00002);

  relu((float *)b94157388842000, (float *)b94157388842192, 1, 7, 7, 512);

  zero_pad_nhwc((float *)b94157388842480prepadding, (float *)b94157388842192, 7,
                7, 512, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388842480, (float *)b94157388842480prepadding,
      stage4_unit2_conv3_weight, 7, 7, 1, 1, 512, 2048, 1, 1);

  int b94157388842768_out_shape[4] = {1, 7, 7, 2048};
  int b94157388842768_a_shape[4] = {1, 7, 7, 2048};
  int b94157388842768_b_shape[4] = {1, 7, 7, 2048};
  add_with_broadcasting(
      (float *)b94157388842768, (float *)b94157388842480,
      (float *)b94157388839360, (int *)b94157388842768_out_shape, 4,
      (int *)b94157388842768_a_shape, 4, (int *)b94157388842768_b_shape, 4);

  batchNormInference((float *)b94157388842768, (float *)b94157388843328, 1, 7,
                     7, 2048, stage4_unit3_bn1_gamma, stage4_unit3_bn1_beta,
                     stage4_unit3_bn1_moving_mean, stage4_unit3_bn1_moving_var,
                     0.00002);

  relu((float *)b94157388843328, (float *)b94157388843520, 1, 7, 7, 2048);

  zero_pad_nhwc((float *)b94157388843808prepadding, (float *)b94157388843520, 7,
                7, 2048, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388843808, (float *)b94157388843808prepadding,
      stage4_unit3_conv1_weight, 7, 7, 1, 1, 2048, 512, 1, 1);

  batchNormInference((float *)b94157388843808, (float *)b94157388844368, 1, 7,
                     7, 512, stage4_unit3_bn2_gamma, stage4_unit3_bn2_beta,
                     stage4_unit3_bn2_moving_mean, stage4_unit3_bn2_moving_var,
                     0.00002);

  relu((float *)b94157388844368, (float *)b94157388844560, 1, 7, 7, 512);

  zero_pad_nhwc((float *)b94157388844848prepadding, (float *)b94157388844560, 7,
                7, 512, 1, 1, 1, 1);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388844848, (float *)b94157388844848prepadding,
      stage4_unit3_conv2_weight, 9, 9, 3, 3, 512, 512, 1, 1);

  batchNormInference((float *)b94157388844848, (float *)b94157388845408, 1, 7,
                     7, 512, stage4_unit3_bn3_gamma, stage4_unit3_bn3_beta,
                     stage4_unit3_bn3_moving_mean, stage4_unit3_bn3_moving_var,
                     0.00002);

  relu((float *)b94157388845408, (float *)b94157388845600, 1, 7, 7, 512);

  zero_pad_nhwc((float *)b94157388845888prepadding, (float *)b94157388845600, 7,
                7, 512, 0, 0, 0, 0);

  rtml_systolic_array_weight_stationary_conv2d_nhwc_hwio_prepadded(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388845888, (float *)b94157388845888prepadding,
      stage4_unit3_conv3_weight, 7, 7, 1, 1, 512, 2048, 1, 1);

  int b94157388846176_out_shape[4] = {1, 7, 7, 2048};
  int b94157388846176_a_shape[4] = {1, 7, 7, 2048};
  int b94157388846176_b_shape[4] = {1, 7, 7, 2048};
  add_with_broadcasting(
      (float *)b94157388846176, (float *)b94157388845888,
      (float *)b94157388842768, (int *)b94157388846176_out_shape, 4,
      (int *)b94157388846176_a_shape, 4, (int *)b94157388846176_b_shape, 4);

  batchNormInference((float *)b94157388846176, (float *)b94157388846736, 1, 7,
                     7, 2048, bn1_gamma, bn1_beta, bn1_moving_mean,
                     bn1_moving_var, 0.00002);

  relu((float *)b94157388846736, (float *)b94157388846928, 1, 7, 7, 2048);

  globalAvgPool((float *)b94157388846928, (float *)b94157388847216, 1, 7, 7,
                2048);

  rtml_systolic_array_weight_stationary_fc(
      0, // hardware id hardwired to 0 for monolithic case
      (float *)b94157388847792, (float *)b94157388847216, fc1_weight, 2048,
      1000, 1);

  int b94157388848080_out_shape[2] = {1, 1000};
  int b94157388848080_a_shape[2] = {1, 1000};
  int b94157388848080_b_shape[1] = {1000};
  add_with_broadcasting((float *)b94157388848080, (float *)b94157388847792,
                        (float *)fc1_bias, (int *)b94157388848080_out_shape, 2,
                        (int *)b94157388848080_a_shape, 2,
                        (int *)b94157388848080_b_shape, 1);

  softmax1D((float *)b94157388848080, (float *)out, 1000);
}
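As a sanity check on what the tail of this program computes, here is the NumPy equivalent of the last four extern calls above (global average pool, the systolic-array FC, the broadcasting bias add, and the softmax). This is a sketch of the intended semantics only, not the ops.c implementation:

def tail_reference(x, fc_weight, fc_bias):
    """Reference semantics for the last four calls of compiled().

    x:         (1, 7, 7, 2048) NHWC activations
    fc_weight: (1000, 2048), matching the dense workload TVM reports below
    fc_bias:   (1000,)
    """
    pooled = x.mean(axis=(1, 2))              # globalAvgPool -> (1, 2048)
    logits = pooled @ fc_weight.T             # weight-stationary fc -> (1, 1000)
    logits = logits + fc_bias                 # add_with_broadcasting: (1,1000) + (1000,)
    e = np.exp(logits - logits.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)  # softmax1D over the 1000 classes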

In [3]:
data_shape = [int(v) for v in resnet50_module['main'].params[0].checked_type.shape]
data = np.random.rand(*data_shape).astype('float32')
expected = tvm.relay.create_executor(mod=resnet50_module).evaluate()(data, **resnet50_params).asnumpy()
conv2d NHWC layout is not optimized for x86 with autotvm. (repeated 20 times)
Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('dense_nopack.x86', ('TENSOR', (1, 2048), 'float32'), ('TENSOR', (1000, 2048), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
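(The warnings above come from TVM running the model on the CPU to produce the expected reference output; they say nothing about the generated C, which doesn't go through autotvm at all.)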
In [4]:
testbench_filename = 'resnet50_testbench.c'
with open(testbench_filename, 'w') as f:
    # Includes
    f.write("""
#include <assert.h>
#include <math.h>
""")

    # Output buffer
    out_shape = [int(v) for v in resnet50_module['main'].body.checked_type.shape]
    f.write(f"float out{''.join(f'[{v}]' for v in out_shape)} = ")
    print_array_efficiently(np.zeros(shape=out_shape), f)
    f.write(";\n")

    # Input buffer
    f.write(f"float data{''.join(f'[{v}]' for v in data_shape)} = ")
    print_array_efficiently(data, f)
    f.write(";\n")

    # Expected value buffer
    f.write(f"float expected{''.join(f'[{v}]' for v in expected.shape)} = ")
    print_array_efficiently(expected, f)
    f.write(";\n")


    for name, param in resnet50_params.items():
        f.write(f"float {name}{''.join(f'[{int(v)}]' for v in param.shape)} = ")
        print_array_efficiently(param.asnumpy(), f)
        f.write(";\n")

    f.write(f'''
int main() {{
    compiled((float*) out, (float*) data, {
        ','.join(f'(float*){name}' for name in resnet50_params)
    });

    for (int i = 0; i < {expected.size}; ++i) {{
        assert(fabs(((float*)out)[i] - ((float*)expected)[i]) < 0.0001);
    }}
}}''')
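The heavy lifting in the cell above is done by print_array_efficiently, imported at the top of the notebook. For readers following along without it, a naive stand-in (hypothetical; the real one is presumably smarter about formatting) only needs to emit a brace-nested C initializer:

def print_array_naively(arr, f):
    # Hypothetical stand-in for print_array_efficiently: recursively emit
    # a NumPy array as a nested C initializer, e.g. {{0.0, 1.0}, {2.0, 3.0}}.
    if arr.ndim == 0:
        f.write(repr(float(arr)))
        return
    f.write("{")
    for i, sub in enumerate(arr):
        if i:
            f.write(", ")
        print_array_naively(sub, f)
    f.write("}")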
In [5]:
try:
    run([
        'gcc', c_filename, testbench_filename, 'ops.c', '-o', 'resnet50_testbench', '-lm'
    ], check=True, capture_output=True)
except CalledProcessError as e:
    print(e.stderr.decode())
In [6]:
try:
    run([
        './resnet50_testbench'
    ], check=True, capture_output=True)
except CalledProcessError as e:
    print(e.stderr.decode())

We're looking for a better way to pass parameters to the simulation. It's really expensive and annoying to write and compile these giant C testbenches with all of the parameters baked in as literals. We'd much rather package up our parameters, e.g. as a pickle file, and load them from C. So let's try that. First, we make a dictionary of the parameters to be pickled:

In [7]:
params_to_pickle = { name: ndarray.asnumpy() for name, ndarray in resnet50_params.items() }
resnet50_params_pickle_filename = "resnet50_params.pickle"
with open(resnet50_params_pickle_filename, 'wb') as f:
    pickle.dump(params_to_pickle, f)
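It's worth a quick check that the pickle round-trips the way the C side will assume, i.e. flat, C-contiguous float32 buffers:

with open(resnet50_params_pickle_filename, 'rb') as f:
    reloaded = pickle.load(f)

for name, arr in reloaded.items():
    assert arr.dtype == np.float32, name
    assert arr.flags['C_CONTIGUOUS'], name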
In [8]:
resnet50_wrapper_filename = "resnet50_wrapper.c"
with open(resnet50_wrapper_filename, 'w') as f:
    # The loader gets &name so it can allocate each array and write the
    # pointer back; wrapper() frees them once compiled() returns.
    f.write(f'''
#include <Python.h>
#include <stdlib.h>

void wrapper() {{
    {linesep.join(f'float* {name} = NULL;' for name in resnet50_params)}

    PyObject* db = bsg_py_load_pickle_db("{resnet50_params_pickle_filename}");
    {linesep.join(f'bsg_py_load_array_from_db(db, "{name}", &{name});' for name in resnet50_params)}

    compiled(out, data, {', '.join(name for name in resnet50_params)});

    {linesep.join(f'free({name});' for name in resnet50_params)}
}}''')
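The bsg_py_* helpers above are the interface we want, not something that exists yet. For comparison, the opposite direction already works with nothing beyond the standard library: compile the generated C into a shared object and drive it from Python over ctypes, passing each parameter as a raw float pointer. A sketch, assuming the same compiled() signature the testbench uses (the shared-object name and build line here are made up):

import ctypes

# Hypothetical build: gcc -shared -fPIC resnet50.c ops.c -o resnet50.so -lm
lib = ctypes.CDLL('./resnet50.so')
lib.compiled.restype = None

def float_p(a):
    return a.ctypes.data_as(ctypes.POINTER(ctypes.c_float))

out = np.zeros(out_shape, dtype=np.float32)
# Keep the parameter arrays alive for the duration of the call.
param_arrays = [np.ascontiguousarray(params_to_pickle[name])
                for name in resnet50_params]

lib.compiled(float_p(out), float_p(data), *[float_p(a) for a in param_arrays])

np.testing.assert_allclose(out, expected, atol=1e-4)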