Neureka #1

Open · wants to merge 271 commits into base: neureka

Commits (271)
8e8febd
Rename folders
lukamac May 19, 2022
e42368f
small modification to read correctly the output of the network in int32
ABurrello May 20, 2022
a72bf94
fixing bugs for Quantlab 8bits networks
ABurrello May 24, 2022
7957819
Remove i_conv
lukamac May 24, 2022
dfdb3d1
modified the code reserved space. It is now in the config file of eve…
ABurrello May 24, 2022
2005dc1
Add to_byte function
lukamac May 25, 2022
5a5da6e
Change pointer type to match output type and bits
lukamac May 25, 2022
c779bdc
Fix wrong check_sum_out
lukamac May 25, 2022
9bfdd04
MV1/MV2 4bits/8bits uint/int with quantlab working
ABurrello May 27, 2022
8a4e64a
Full tests performed on the 8 config_files present, 4 networks from N…
ABurrello May 30, 2022
4e33a98
fixed free of ram and global variables for multiple executions of net…
ABurrello Jun 1, 2022
d576e64
fixed memory issues. layer_generator working only with output unsigned
ABurrello Jun 2, 2022
60e2b66
fixed order of arguments for arguments of add nodes exported by quantlab
ABurrello Jun 9, 2022
5e847e8
added modification to 1d,2d,3d dma copies and all files for one layer…
ABurrello Jun 17, 2022
451403c
added a new target, GAP8_board, to deploy networks on chip without do…
ABurrello Jun 29, 2022
8ae2b6a
updated dory_example pointer
ABurrello Jun 29, 2022
7673e8a
fixed problems with weight dimension not multiple of 4
ABurrello Jul 7, 2022
834bf1f
Diana fixing of addition nodes
ABurrello Jul 8, 2022
0d6bc22
Add GAP8 network tests
lukamac Jul 8, 2022
1bd1a30
Add printing of stderr on build fail
lukamac Jul 8, 2022
c581e30
adding support for Resnet20 deployed on Diana
ABurrello Jul 12, 2022
e36bf80
fixed submodules commit and added tests to Tests
ABurrello Jul 12, 2022
738a455
Removed submodule
ABurrello Jul 12, 2022
2567484
added dory-hal submodule for Diana
ABurrello Jul 12, 2022
47a2229
minors
ABurrello Jul 14, 2022
72676fe
added support for analog operations of Diana
Jul 19, 2022
64d3c2b
analog deployment working for performance of ox_unrolling=1
Jul 20, 2022
897efb7
added tests for GAP8_board
Jul 20, 2022
7697249
Refactor L3-L2 task sched
lukamac Jul 20, 2022
dea8c84
fixing analog execution
Jul 27, 2022
0b927c4
added modifications to extract strings for TVM.
Jul 27, 2022
e012dc9
minor
Jul 28, 2022
745fda9
new backend for DianaTVM
Jul 28, 2022
fcc2a60
changed prototype of layer function
FrancescoConti Jul 28, 2022
c4eb561
Merge pull request #29 from pulp-platform/refactoring-rebased
ABurrello Jul 28, 2022
cf5ecad
Update README.md
FrancescoConti Jul 28, 2022
eb1fa8d
renamed folders to remove -
Jul 28, 2022
6fc01a8
modifying structure to create a package
Jul 28, 2022
4ff3f08
added __init__.py files to create dory library
Jul 29, 2022
81fc21b
minors
Jul 29, 2022
0f9ad4e
Minor changes to integrate correctly with TVM
maartenvds Jul 29, 2022
53c7d7a
Merge pull request #30 from maartenvds/master
ABurrello Jul 29, 2022
34a2bf4
updated README.md
Aug 1, 2022
bd14297
Update README.md
ABurrello Aug 1, 2022
e1bcc69
Add conditional import for dory_examples
JosseVanDelm Aug 1, 2022
61cf3c1
Merge pull request #32 from JosseVanDelm/master
ABurrello Aug 1, 2022
09d7a41
fix mixed-hw deployment
da-gazzi Aug 2, 2022
6165889
Use relative path for git submodules
JosseVanDelm Aug 2, 2022
80e00a7
Merge pull request #33 from JosseVanDelm/master
ABurrello Aug 2, 2022
c9605e2
fix HW_node._compress for signed inputs
da-gazzi Aug 5, 2022
e7d83a6
add support to GAP8_L2 for multiple inputs, signed inputs, sub-8b inputs
da-gazzi Aug 5, 2022
f24775b
GAP network templates: close file descriptors (memory leak)
da-gazzi Aug 10, 2022
5c9f9d8
vectorize HW_node._compress and use it also for weights
da-gazzi Aug 10, 2022
25d6651
use np.ndarray.tofile instead of for-loop writing of hex files
da-gazzi Aug 10, 2022
a6360e3
print memory usage when building
da-gazzi Aug 10, 2022
a1fc884
guard against non-functional n_inputs values in network_generate
da-gazzi Aug 10, 2022
5f2edf7
quit on GAP8_board + mixed-hw combination
da-gazzi Aug 10, 2022
109b6ba
add stub code for multiple inputs to GAP8_board and GAP8_gvsoc targets
da-gazzi Aug 10, 2022
1695a02
add n_test_inputs to every node correctly
da-gazzi Aug 11, 2022
6b79933
accept but ignore Squeeze layers
da-gazzi Aug 11, 2022
2112f3d
conv1d
da-gazzi Aug 11, 2022
649356c
unsigned conv1d w/ dilations working for gap8_gvsoc
da-gazzi Aug 16, 2022
d9071c1
unified outmult and outmul. The different names caused problems
Aug 24, 2022
76447b8
Merge branch 'master' of github.com:pulp-platform/dory
Aug 24, 2022
2c67bb7
added a first version of untested occamy backend
Aug 26, 2022
c97eb6d
Removing mixed kernels for wrong clone
Aug 31, 2022
924a76a
Added back pulp-nn-mixed kernels
Aug 31, 2022
f51eead
added branch for commit
Aug 31, 2022
aad120b
changed position of tests for docker
Aug 31, 2022
1362551
creating main.yml
ABurrello Aug 31, 2022
49ae096
Create docker-image.yml
ABurrello Aug 31, 2022
3c48141
Update docker-image.yml
ABurrello Aug 31, 2022
937dbd9
added Dockerfile
Aug 31, 2022
4389099
Create docker-image.yml
ABurrello Aug 31, 2022
d5a3e06
Update docker-image.yml
ABurrello Aug 31, 2022
4d7c646
Update docker-image.yml
ABurrello Aug 31, 2022
95286db
Update docker-image.yml
ABurrello Aug 31, 2022
ffee584
Update docker-image.yml
ABurrello Aug 31, 2022
6c93edc
Update docker-image.yml
ABurrello Aug 31, 2022
692e1da
minors
Aug 31, 2022
9ee36a2
Merge branch 'master' of github.com:pulp-platform/dory
Aug 31, 2022
5c7793c
Update docker-image.yml
ABurrello Aug 31, 2022
a807aeb
Update docker-image.yml
ABurrello Aug 31, 2022
bfd8523
Update docker-image.yml
ABurrello Aug 31, 2022
e37c3b7
Update docker-image.yml
ABurrello Aug 31, 2022
2d2b978
Update docker-image.yml
ABurrello Aug 31, 2022
fcc674b
Update docker-image.yml
ABurrello Aug 31, 2022
34c39a0
Update docker-image.yml
ABurrello Aug 31, 2022
768f1f9
finalized CI. Testing
Aug 31, 2022
1bff86f
Update docker-image.yml
ABurrello Aug 31, 2022
0fbd9eb
Update docker-image.yml
ABurrello Aug 31, 2022
db23495
Update docker-image.yml
ABurrello Aug 31, 2022
f595e54
Update docker-image.yml
ABurrello Aug 31, 2022
707d1fe
Update docker-image.yml
ABurrello Aug 31, 2022
40a580c
Update docker-image.yml
ABurrello Aug 31, 2022
ec6cb94
Update docker-image.yml
ABurrello Aug 31, 2022
20b45f3
Update docker-image.yml
ABurrello Aug 31, 2022
7ca9269
Update docker-image.yml
ABurrello Aug 31, 2022
f2bfc03
Update docker-image.yml
ABurrello Aug 31, 2022
2f5c979
Update docker-image.yml
ABurrello Sep 1, 2022
5a51b33
Update docker-image.yml
ABurrello Sep 1, 2022
db23ae6
Update docker-image.yml
ABurrello Sep 1, 2022
d3069d1
Update docker-image.yml
ABurrello Sep 1, 2022
4674638
Update docker-image.yml
ABurrello Sep 1, 2022
8c8fc71
Update docker-image.yml
ABurrello Sep 1, 2022
94786d2
testing CI
Sep 1, 2022
8a5801c
testing CI
Sep 1, 2022
6c9fd4c
Update test_GAP8.py
ABurrello Sep 1, 2022
da01cf1
testing CI
Sep 1, 2022
98064d3
Merge branch 'master' of github.com:pulp-platform/dory
Sep 1, 2022
14fbd4b
Update test_GAP8.py
ABurrello Sep 1, 2022
d1ac156
testing CI
Sep 1, 2022
33e9861
Update test_GAP8.py
ABurrello Sep 1, 2022
6a0e54a
Update docker-image.yml
ABurrello Sep 1, 2022
4bb2ca3
checking that CI fails on uncorrect tests
Sep 1, 2022
8e21bc2
checking that CI fails on uncorrect tests
Sep 1, 2022
8501e93
checking that CI fails on uncorrect tests
Sep 1, 2022
8688f0e
fixed error in checking CI
Sep 1, 2022
06b1b91
fixed error in checking CI. Checking the working / not working output
Sep 1, 2022
0988e57
Added support for PULP-SDK for L2-only applications
ccioflan Sep 2, 2022
5d48ee9
conv1d support for mixed-hw
da-gazzi Sep 5, 2022
dea350e
revert l2 size for GAP8_gvsoc
da-gazzi Sep 5, 2022
5a122dc
added bias in 2d tilers. They were missing. Modified the template to …
Sep 23, 2022
23ec6f9
fix layer_generate to "sandwich" layers
da-gazzi Sep 23, 2022
45e987b
fix different-precision adder nodes (mismatches but no crash)
da-gazzi Sep 23, 2022
05f2543
fix bias memory calculation
da-gazzi Oct 3, 2022
3e95875
fix mixed-prec add also for gap8_board/l2
da-gazzi Oct 3, 2022
f3837b1
whitespace
da-gazzi Oct 4, 2022
1518cdd
Merge branch 'georgr/fixes' into georgr/merge_candidate
da-gazzi Oct 4, 2022
4928890
update pulp-nn-mixed
da-gazzi Oct 5, 2022
08c887c
Merge pull request #34 from pulp-platform/l2_pulp_sdk
ABurrello Oct 6, 2022
7c5d2f8
reduced layers of the docker image
Oct 6, 2022
86ce9cf
Merge branch 'master' of github.com:pulp-platform/dory
Oct 6, 2022
6145cb4
Merge branch 'master' into georgr/merge_candidate
ABurrello Oct 6, 2022
70ee1ce
Merge pull request #38 from pulp-platform/georgr/merge_candidate
ABurrello Oct 6, 2022
f3a3902
modified docker. NOT WORKING VERSION OF DORY. NEED TO FIX OUTPUTS
Oct 6, 2022
9257733
modified docker. NOT WORKING VERSION OF DORY. NEED TO FIX OUTPUTS
Oct 6, 2022
ca6166b
fix all broken tests & add new mixed-hw/signed test
da-gazzi Oct 7, 2022
9c1f1a6
Merge pull request #39 from pulp-platform/georgr/fix_breakage
ABurrello Oct 7, 2022
58612e7
pointing to the new commits
Oct 7, 2022
0027a8c
modifying Diana backend to add parameters to disable l2_l1 transfers
Oct 13, 2022
d2419cd
allow manual CI running
da-gazzi Oct 13, 2022
4a66ff8
modifications to support DORI in TVM
Oct 14, 2022
803de25
Merge branch 'master' of github.com:pulp-platform/dory
Oct 14, 2022
2c283a4
testing analog, FC and conv in Diana
Oct 17, 2022
d5c3400
Georgr/pr (#40)
da-gazzi Oct 18, 2022
af9ce57
remove redundant commands from docker setup scripts
da-gazzi Oct 18, 2022
3620e58
testing Diana analog+FC
Oct 18, 2022
8ece309
fixed FC digital on Diana. Analog under test. Stack in HW
Oct 18, 2022
4369168
modifications to integrate FC in TVM
Oct 18, 2022
f48d0c7
[WIP] unify templates to fixed lmacan's versions (works for gap8_gvsoc)
da-gazzi Oct 19, 2022
f06798a
refactoring: forgot actual template files...
da-gazzi Oct 20, 2022
f8e7262
[WIP] fix small type warning
da-gazzi Oct 20, 2022
3d4a6c6
analog, and all digital tests working on Diana
Oct 20, 2022
af1b540
refactor DMA to prettier code & proper 2D transfers
da-gazzi Oct 20, 2022
f8b1313
split DMA functions into .c and .h, fix checksums
da-gazzi Oct 20, 2022
68ddc21
GAP8 refactor for GAP8_board - works w/ pulp-sdk & GVSOC
da-gazzi Oct 21, 2022
05f1e47
forgotten refactoring files
da-gazzi Oct 21, 2022
d5bd343
GAP8_board pooling layer was missing a DMA barrier!
da-gazzi Oct 21, 2022
077d38d
CI: adapt test_GAP8 for refactored template's output
da-gazzi Oct 24, 2022
a058099
unify GAP8 tilers (excluding board_L2)
da-gazzi Oct 24, 2022
1e23267
CI: try to fix docker build
da-gazzi Oct 24, 2022
a664424
refactor GAP8_board_L2 backend
da-gazzi Oct 24, 2022
089b2d7
change ifdefs in dory_dma.h
da-gazzi Oct 24, 2022
74faf5b
fix dory_dma.h - CI w/ GAP_SDK should work now
da-gazzi Oct 24, 2022
95ffadd
fix mem.c for GAP SDK (GAP8) (hopefully)
da-gazzi Oct 24, 2022
f791a50
try to fix GAP SDK build
da-gazzi Oct 24, 2022
7266724
CI: hail mary to get gap_sdk to work - don't install magick in docker
da-gazzi Oct 25, 2022
bb765b3
(hopefully) finally fix GAP8 compatibility!
da-gazzi Oct 26, 2022
4bf0d72
change HW_description.json of GAP targets back to gap_sdk
da-gazzi Oct 26, 2022
5476f7c
minor modifications on Diana tiler
Oct 28, 2022
5ea8ae8
fix refactoring on gapuino - everything working
da-gazzi Nov 1, 2022
51a647b
Patch template of analog Conv2D for Diana TVM backend
Nov 1, 2022
102803b
fix board_L2 pooling template
da-gazzi Nov 1, 2022
771e848
another fix of board_L2 pooling
da-gazzi Nov 2, 2022
03f4ec7
Merge pull request #43 from JosseVanDelm/master
ABurrello Nov 2, 2022
46a2651
fix GAP8_board_L2; add back multiple input option
da-gazzi Nov 2, 2022
5e87ae1
updated op_typ in C parser Diana
Nov 2, 2022
a29470e
fix multi-input with L3
da-gazzi Nov 2, 2022
53c5847
fix all refactored stuff, rename GAP8 backend to PULP
da-gazzi Nov 2, 2022
8499c4e
update dory_examples
da-gazzi Nov 2, 2022
f48b57e
make headers prettier
da-gazzi Nov 2, 2022
530df05
update examples
da-gazzi Nov 2, 2022
40b7b59
move identical layer templates to common
da-gazzi Nov 3, 2022
fa29858
add back TCDM 2D transfer flags to mchan.h
da-gazzi Nov 3, 2022
0b3ce35
update dory_examples
da-gazzi Nov 3, 2022
0e9011e
Merge branch 'master' into georgr/refactor_gap8
da-gazzi Nov 3, 2022
e35dfdb
Merge pull request #42 from pulp-platform/georgr/refactor_gap8
da-gazzi Nov 3, 2022
ca3a6f6
fixed layout FC. FC working with all dimensions
Nov 3, 2022
5b36c91
Merge branch 'master' of github.com:pulp-platform/dory
Nov 3, 2022
8b76844
added bias type in diana_tvm
Nov 4, 2022
20b3d61
minor
Nov 4, 2022
6e35982
updated dory_example
Nov 4, 2022
66cde41
fixed bias and nif <16 for FC on Diana
Nov 4, 2022
212735c
Fix regression in Conv2D template for Diana_TVM
Nov 4, 2022
202710d
Merge pull request #44 from JosseVanDelm/master
ABurrello Nov 4, 2022
a8e28ec
modified template writer to allow TVM to export TVM node
Nov 11, 2022
b4d5eed
further fixes for Add
Nov 11, 2022
9791d6c
removed useless include
Nov 11, 2022
db9ae6f
modified analog weights to fit L2
Nov 12, 2022
afddfa6
changed padding margins for analog
Nov 12, 2022
2fd2cdf
fixed error in weights transfer
Nov 14, 2022
c648174
Fixes ONNX spec compliance in parser
Nov 23, 2022
95ecc2b
Fixes bug where possibly undefined fields in HW_node class are called
Nov 23, 2022
0b5a0d3
Fixes bugs where bitwise operators were used with possibly float type…
Nov 23, 2022
4e4e299
Fixes ONNX compliance of Quantlab Parser
Nov 23, 2022
16c42d0
Merge remote-tracking branch 'origin/master' into neureka
Nov 28, 2022
a6ad6a2
Intermediate commit
Nov 28, 2022
44b31b7
Delete unneeded files
Nov 28, 2022
4266e47
Updated dependencies
Nov 29, 2022
07d8a4c
Merged main branch
Nov 29, 2022
7e3a290
WIP: Working on offloading, MV2 passes on cluster
Dec 12, 2022
5f54cb7
MV1 passes with offloading
Dec 13, 2022
b41095c
Cleanup
Dec 14, 2022
7a2d1fd
fixed layer generate
Dec 14, 2022
ba05976
Fixed dependencies
Dec 14, 2022
2b7075f
Fixed weight memory support in NEUREKA
Dec 15, 2022
54cc080
Fixed tiling skip
Dec 15, 2022
9be5d18
Fixed several N-EUREKA memory issues
Dec 15, 2022
d25484d
Single tile padding works
Dec 21, 2022
1ac6373
Slightly tested padding works
Dec 22, 2022
a9e3ded
Fixed MVNv1, stride==1
Jan 3, 2023
d61ce8f
Fixed geometrical constraint for s>1
Jan 3, 2023
e2ab531
Initial version with single-tile striding working
Jan 4, 2023
92e88f3
Fixed regression
Jan 4, 2023
f9126c9
Fixed striding implementation for tiled feature maps
Jan 5, 2023
d2e93b4
Fixed padding bug, MBNv1 runs
Jan 5, 2023
20ed649
Added offloading for non-unit stride
Jan 5, 2023
ed542ad
Improved reshuffle operation by DMA use
Jan 5, 2023
c89302b
Fixed MVN2 regression
Jan 6, 2023
0b2f224
Fixed signed output behaviour for NEUREKA
Jan 6, 2023
e34ab13
First version of MVN2 working
Jan 6, 2023
36c8f2b
Implemented L2 and mixed
Jan 9, 2023
0e76826
Checking in missing sources
Jan 9, 2023
acbdaf9
WIP: Regressing on real hardware
Jan 10, 2023
435066d
Fixed regression for MVN1 on board
Jan 12, 2023
21c3f73
End-to-end HW deployment flow
Jan 12, 2023
d4b06c3
Further fixes
Jan 12, 2023
e734696
Update for sharing w/ ETH
Jan 31, 2023
70b3c7e
Merge branch 'neureka' of github.com:Scheremo/dory into neureka
Jan 31, 2023
af0c6b2
Link pulp-nn libraries
Jan 31, 2023
8ac25b7
Update HandTracking Deployment parts
Feb 2, 2023
80d0839
Update paths
Feb 3, 2023
0269eff
Update dory submodule paths
Feb 3, 2023
a2aee12
Remove pulp-nn-1d
Feb 3, 2023
02b0470
Update module paths
Feb 3, 2023
ad1bfcd
Fixed sub-byte weight implementation
Feb 7, 2023
018db27
Adds JSON Dump of task before template write out
Feb 28, 2023
26cc7f9
Add RUNTIMEMEASUREMENT flag for measuring runtime on N-EUREKA
Feb 28, 2023
0837f43
Fix broken commit
Mar 1, 2023
Update for sharing w/ ETH
Moritz authored and Moritz committed Jan 31, 2023
commit e7346966b2390cf04f9e718ba431d3d195b8e9c5
2 changes: 1 addition & 1 deletion confs/testconfig_HT_frontEnd.json
@@ -1,7 +1,7 @@
{
"BNRelu_bits": 32,
"onnx_file": "../../quant/doryStimuli_frontEnd/QL_testnet.onnx_ql_integerized.onnx",
"code reserved space": 260000,
"code reserved space": 570000,
"input_bits": 8,
"input_signed": false,
"use_wmem": true,
31 changes: 17 additions & 14 deletions dory/Hardware_targets/Siracusa/Common/C_Parser.py
@@ -31,15 +31,15 @@

class C_Parser_Siracusa(Parser_HW_to_C):
# Used to manage the ONNX files. By now, supported Convolutions (PW and DW), Pooling, Fully Connected and Relu.
def __init__(self, graph, config_file, config_file_dir, verbose_level, perf_layer, precision_library, app_directory, n_inputs=1):
def __init__(self, graph, config_file, config_file_dir, verbose_level, perf_layer, precision_library, app_directory, n_inputs=1, prefix=''):

file_path = self.get_file_path()
with open(os.path.join(file_path, "HW_description.json")) as f:
HW_description = json.load(f)
self.precision_library = precision_library
self.source_Constant_bits_library = config_file["BNRelu_bits"]
self.config_file = config_file
super().__init__(graph, os.path.join(config_file_dir, os.path.dirname(config_file["onnx_file"])), HW_description, verbose_level, perf_layer, "Makefile", app_directory, n_inputs)
super().__init__(graph, os.path.join(config_file_dir, os.path.dirname(config_file["onnx_file"])), HW_description, verbose_level, perf_layer, "Makefile", app_directory, n_inputs, prefix)
self.acc = Neureka()
try:
db = HW_description['double_buffering']
@@ -159,6 +159,7 @@ def mapping_layers_to_C_files(self):
n_memory_levels = self.HW_description['memory']['levels']

for i, node in enumerate(self.HWgraph):
node.prefix = self.prefix
if not hasattr(node, "offloadable") or not node.offloadable:
self.map_layer_to_C_file(node, n_memory_levels, tmpl_dir, out_dir)
else:
@@ -189,10 +190,10 @@ def create_hex_weight(self, node):
weights += bytearray([0] * (4 - len(weights) % 4))

weightstr = ''
weightstr += f"#include \"{node.name}_weights.h\"\r\n"
weightstr += f"#include \"{node.prefix}{node.name}_weights.h\"\r\n"
weightstr += f"#include \"pmsis.h\"\r\n"
weightstr += '__attribute__ ((section(".weightmem_sram"))) '
weightstr += f"unsigned char {node.name}_weights[{len(weights)}] = "
weightstr += f"unsigned char {node.prefix}{node.name}_weights[{len(weights)}] = "
weightstr += "{"
weightstr += ", ".join("0x"+format(x, '02x') for x in weights)
weightstr += "};\r\n"
@@ -201,25 +202,25 @@ def create_hex_weight(self, node):
if const != 0:
val = bytes(getattr(node,const)['value'])
weightstr += 'PI_L2 '
weightstr += f"unsigned char {node.name}_{const}[{len(val)}] = "
weightstr += f"unsigned char {node.prefix}{node.name}_{const}[{len(val)}] = "
weightstr += "{"
weightstr += ", ".join("0x"+format(x, '02x') for x in val)
weightstr += "};\r\n"

weightstr_h = f"#ifndef __INCLUDE_GUARD_{node.name}\r\n"
weightstr_h += f"#define __INCLUDE_GUARD_{node.name}\r\n"
weightstr_h += f"extern unsigned char {node.name}_weights[{len(weights)}];\r\n"
weightstr_h = f"#ifndef __INCLUDE_GUARD_{node.prefix}{node.name}\r\n"
weightstr_h += f"#define __INCLUDE_GUARD_{node.prefix}{node.name}\r\n"
weightstr_h += f"extern unsigned char {node.prefix}{node.name}_weights[{len(weights)}];\r\n"
for const in constants[1:]:
if const != 0:
val = bytes(getattr(node,const)['value'])
weightstr_h += f"extern unsigned char {node.name}_{const}[{len(val)}];\r\n"
weightstr_h += f"extern unsigned char {node.prefix}{node.name}_{const}[{len(val)}];\r\n"
weightstr_h += f"\r\n#endif"

filepath = os.path.join(self.app_directory, 'src', node.name + "_weights.c")
filepath = os.path.join(self.app_directory, 'src', node.prefix + node.name + "_weights.c")
with open(filepath, 'w') as file:
file.write(weightstr)

filepath = os.path.join(self.app_directory, 'inc', node.name + "_weights.h")
filepath = os.path.join(self.app_directory, 'inc', node.prefix + node.name + "_weights.h")
with open(filepath, 'w') as file:
file.write(weightstr_h)
else:
@@ -250,16 +251,17 @@ def create_hex_weight(self, node):
tk['weights_vectors'] = self.weights_vectors
tk['weights_dimensions'] = self.weights_dimensions
tk['DORY_HW_graph'] = self.HWgraph
tk['prefix'] = node.prefix
tk['sdk'] = node.HW_description["software development kit"]["name"]
root = os.path.dirname(__file__)
tmpl = Template(filename=os.path.join(root, "Templates/weights_h_template.h"))
s = tmpl.render(**tk)
save_string = os.path.join(self.inc_dir, 'weights.h')
save_string = os.path.join(self.inc_dir, f'{node.prefix}weights.h')
with open(save_string, "w") as f:
f.write(s)
tmpl = Template(filename=os.path.join(root, "Templates/weights_definition_h_template.h"))
s = tmpl.render(**tk)
save_string = os.path.join(self.inc_dir, 'weights_definition.h')
save_string = os.path.join(self.inc_dir, f'{node.prefix}weights_definition.h')
with open(save_string, "w") as f:
f.write(s)

@@ -294,11 +296,12 @@ def create_hex_input(self):
s += f"{hex(np.uint8(num+256))}, "
tk = OrderedDict([])
tk['input_values'] = s[:-2]
tk['prefix'] = self.prefix
tk['dimension'] = len(x_in)
tk['sdk'] = self.HW_description["software development kit"]["name"]
root = os.path.dirname(__file__)
tmpl = Template(filename=os.path.join(root, "Templates/input_h_template.h"))
s = tmpl.render(**tk)
save_string = os.path.join(self.inc_dir, 'input.h')
save_string = os.path.join(self.inc_dir, f'{self.prefix}input.h')
with open(save_string, "w") as f:
f.write(s)
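The changes above thread a `prefix` string through every generated artifact (weight `.c`/`.h` files, include guards, symbol names) so that several generated networks can coexist in one application. A minimal Python sketch of the prefixed weight-source emission, with `prefix` and `name` standing in for `node.prefix` and `node.name` (the real `create_hex_weight` additionally pads the byte array to a multiple of 4 and emits the matching header):

```python
def render_weight_source(prefix, name, weights):
    """Sketch of the C source C_Parser_Siracusa.create_hex_weight emits
    for a node placed in weight memory. Simplified: no 4-byte padding,
    no extra per-constant arrays, no header generation."""
    # Render each weight byte as a 0x-prefixed hex literal.
    body = ", ".join(f"0x{b:02x}" for b in weights)
    src = f'#include "{prefix}{name}_weights.h"\r\n'
    src += '#include "pmsis.h"\r\n'
    # Place the array in the dedicated weight-memory section.
    src += '__attribute__ ((section(".weightmem_sram"))) '
    src += f"unsigned char {prefix}{name}_weights[{len(weights)}] = {{{body}}};\r\n"
    return src
```

With an empty prefix this degenerates to the old, unprefixed output, which is why the default `prefix=''` keeps existing callers working.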
19 changes: 9 additions & 10 deletions dory/Hardware_targets/Siracusa/Common/HW_Parser.py
@@ -133,29 +133,28 @@ def check_parameters(self):
@staticmethod
def is_offloadable(node: Layer_node) -> bool:
#SCHEREMO: Check if it's an 8-Bit x 8-Bit or lower convolution
try:
memEstimate = (np.prod(node.input_dimensions)*node.input_channels + np.prod(node.output_dimensions)*node.output_channels + np.prod(node.kernel_shape)*node.input_channels*node.output_channels)
# try:
# memEstimate = (np.prod(node.input_dimensions)*node.input_channels + np.prod(node.output_dimensions)*node.output_channels + np.prod(node.kernel_shape)*node.input_channels*node.output_channels)

# SCHEREMO: MVN2 Hack
# if memEstimate > 1500000:
# return False
except:
return False
# # SCHEREMO: MVN2 Hack
# # if memEstimate > 1500000:
# # return False
# except:
# return False

if node.op_type == "BNReluConv" and node.weight_bits == 8 and node.input_activation_bits == 8:
#SCHEREMO: Check if it's a pointwise convolution:
if node.group == 1 and node.kernel_shape == [1,1]:
print("1x1 dense - Offloading to NEUREKA...")
return True
#SCHEREMO: Check if it's a dense 3x3 convolution:
elif node.input_channels == node.output_channels and node.group == 1 and node.kernel_shape == [3,3]:
elif node.group == 1 and node.kernel_shape == [3,3]:
print("3x3 dense - Offloading to NEUREKA...")
return True
elif node.input_channels == node.output_channels and node.group == node.input_channels and node.kernel_shape == [3,3]: #and node.input_dimensions[0] < 8:
#print("Not offloading to NEUREKA...")
print("3x3 dw - Offloading to NEUREKA...")
return True

return False


@@ -166,7 +165,7 @@ def mapping_to_HW_nodes(self):
if 'offload' in self.config_file and self.config_file['offload'] == True:
print("Offloading to N-EUREKA")
for idx, node in enumerate(self.DORY_Graph):
if (self.HW_description['memory']['levels']>2 and idx==0):
if (self.HW_description['memory']['levels'] > 2 and idx==0):
node.offloadable = False
else:
node.offloadable = onnx_manager_Siracusa.is_offloadable(node)
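After this change, the offloading rule accepts any 8-bit × 8-bit `BNReluConv` that is a 1×1 pointwise, a dense 3×3 (the input-channels == output-channels restriction is dropped), or a 3×3 depthwise convolution. A sketch of the revised predicate, simplified to plain arguments instead of a `Layer_node`:

```python
def is_offloadable(op_type, weight_bits, act_bits, group,
                   kernel_shape, in_ch, out_ch):
    """Sketch of HW_Parser.is_offloadable after this commit: N-EUREKA
    gets 8-bit x 8-bit convolutions of three shapes; everything else
    stays on the RISC-V cluster."""
    if op_type != "BNReluConv" or weight_bits != 8 or act_bits != 8:
        return False
    if group == 1 and kernel_shape == [1, 1]:    # 1x1 dense (pointwise)
        return True
    if group == 1 and kernel_shape == [3, 3]:    # 3x3 dense
        return True
    if group == in_ch and in_ch == out_ch and kernel_shape == [3, 3]:
        return True                              # 3x3 depthwise
    return False
```

Note the memory-estimate guard (`memEstimate > 1500000`) is commented out in the diff, so layer size no longer vetoes offloading here.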
4 changes: 2 additions & 2 deletions dory/Hardware_targets/Siracusa/Common/Templates/Makefile.t
@@ -22,8 +22,8 @@ APP = main
APP_SRCS := $(wildcard src/*.c)
# -O2 with -fno-indirect-inlining is just as fast as -O3 and reduces code size considerably
# by not inlining small functions in the management code
APP_CFLAGS += -DNUM_CORES=$(CORE) -Iinc -O2 -fno-indirect-inlining -w -g3
APP_LDFLAGS += -lm -Wl,--print-memory-usage
APP_CFLAGS += -DNUM_CORES=$(CORE) -Iinc -O3 -w -flto
APP_LDFLAGS += -lm -Wl,--print-memory-usage -flto
FLASH_TYPE ?= HYPERFLASH
RAM_TYPE ?= HYPERRAM

dory/Hardware_targets/Siracusa/Common/Templates/input_h_template.h
@@ -25,9 +25,9 @@
#define __INPUT_H__
#include "pmsis.h"
% if sdk == 'gap_sdk':
L2_DATA uint8_t L2_input_h[${dimension}] = {
L2_DATA uint8_t ${prefix}L2_input_h[${dimension}] = {
% else:
PI_L2 uint8_t L2_input_h[${dimension}] = {
PI_L2 uint8_t ${prefix}L2_input_h[${dimension}] = {
% endif
${input_values}};
#endif
8 changes: 4 additions & 4 deletions dory/Hardware_targets/Siracusa/Common/Templates/main.c.t
@@ -22,18 +22,18 @@ n_inputs = DORY_HW_graph[0].n_test_inputs
single_input = n_inputs==1
%>\
% if not l3_supported:
#include "input.h"
#include "${prefix}input.h"
% else:
#include "mem.h"
% endif
#include "network.h"
#include "${prefix}network.h"
#include "siracusa_padctrl.h"
#include "pmsis.h"

% if verbose:
#define VERBOSE 1
% endif

% if sdk == 'pulp-sdk':
unsigned int PMU_set_voltage(unsigned int Voltage, unsigned int CheckFrequencies) {
return 0;
@@ -89,7 +89,7 @@ int main () {
ram_read(l2_buffer, ram_input, l2_input_size);
% endif

network_run(l2_buffer, ${l2_buffer_size}, l2_buffer, ${"0" if single_input else "exec"}${f", L2_input_h{' + exec * l2_input_size' if not single_input else ''}" if not l3_supported else ""});
network_run(l2_buffer, ${l2_buffer_size}, l2_buffer, ${"0" if single_input else "exec"}${f", {prefix}L2_input_h{' + exec * l2_input_size' if not single_input else ''}" if not l3_supported else ""});

% if not single_input:
}
32 changes: 17 additions & 15 deletions dory/Hardware_targets/Siracusa/Common/Templates/network.c.t
@@ -22,15 +22,15 @@ l3_supported = DORY_HW_graph[0].HW_description['memory']['levels'] > 2
%>\
#define DEFINE_CONSTANTS
% if not l3_supported and files_list != ' ':
#include "weights.h"
#include "${prefix}weights.h"
%endif
#include "pmsis.h"
#include "network.h"
#include "${prefix}network.h"
#include "directional_allocator.h"
#include "mem.h"
#include <string.h>
% for layer in list_h:
#include "${layer}"
#include "${prefix}${layer}"
% endfor

% if sdk == 'pulp-sdk':
@@ -39,8 +39,10 @@ l3_supported = DORY_HW_graph[0].HW_description['memory']['levels'] > 2
% endif

% if verbose:
#define VERBOSE 1
#define VERBOSE 0
% endif


static int nb_callback_exec=0;

static void cluster_task_callback(void *arg)
@@ -50,11 +52,11 @@ static void cluster_task_callback(void *arg)

% if 'Yes' in performance or 'Perf_final' in verbose_level:
static void print_perf(const char *name, const int cycles, const int macs) {
float perf = (float) macs / cycles;
int32_t perf = macs / cycles;
printf("\r\n%s performance:\r\n", name);
printf(" - num cycles: %d\r\n", cycles);
printf(" - MACs: %d\r\n", macs );
printf(" - MAC/cycle: %g\r\n", perf);
printf(" - MAC/cycle: %d\r\n", perf);
printf(" - n. of Cores: %d\r\n\r\n", NUM_CORES);
}

@@ -70,11 +72,11 @@ static void checksum(const char *name, const uint8_t *d, size_t size, uint32_t s
printf("OK\r\n");
else{
printf("Failed: true [%u] vs. calculated [%u]\r\n", sum_true, sum);
printf("Got the following:\r\n");
for (int i = 0; i < size; i++){
printf("%u, ", d[i]);
}
printf("\r\n");
/* printf("Got the following:\r\n"); */
/* for (int i = 0; i < size; i++){ */
/* printf("%u, ", d[i]); */
/* } */
/* printf("\r\n"); */
}
}
#endif
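The `checksum()` helper this hunk touches validates layer outputs against golden values baked in at generation time; the commit only silences the verbose byte dump on mismatch. A minimal Python model of the check (illustrative names; the generated C uses a plain running sum over the output buffer):

```python
def golden_checksum(data):
    """Model of the golden value DORY computes at network-generation
    time: the byte sum of the expected output tensor."""
    return sum(data)

def checksum_ok(output_bytes, sum_true):
    """Model of checksum() in network.c.t: sum the produced output
    bytes and compare against the golden value; the C code prints
    'OK' or 'Failed: true [...] vs. calculated [...]'."""
    return sum(output_bytes) == sum_true
```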
@@ -132,9 +134,9 @@ void execute_layer_fork(void *args) {
% for i, node in enumerate(DORY_HW_graph):
case ${i}:
%if hasattr(node, "offloadable") and node.offloadable:
${func_name[i]}(args);
${prefix}${func_name[i]}(args);
%else:
pi_cl_team_fork(NUM_CORES, (void *)${func_name[i]}, args);
pi_cl_team_fork(NUM_CORES, (void *)${prefix}${func_name[i]}, args);
%endif
break;
% endfor
@@ -309,7 +311,7 @@ void network_run(void *l2_buffer, size_t l2_buffer_size, void *l2_final_output,
asm volatile("": : :"memory");

% if 'Yes' in performance:
//print_perf(Layers_name[i], perf_cyc, NODEs_MACS[i]);
print_perf(Layers_name[i], perf_cyc, NODEs_MACS[i]);
% endif

// TODO: What error?
@@ -433,7 +435,7 @@ void network_run(void *l2_buffer, size_t l2_buffer_size, void *l2_final_output,
dir = !dir;
}

memcpy(L2_output, l2_final_output, activations_out_size[${len(DORY_HW_graph)-1}]);
memcpy(l2_final_output, L2_output, activations_out_size[${len(DORY_HW_graph)-1}]);



23 changes: 15 additions & 8 deletions dory/Hardware_targets/Siracusa/Common/Templates/network.h.t
@@ -17,8 +17,15 @@
* limitations under the License.
*/

#ifndef __NETWORK_H__
#define __NETWORK_H__
#ifndef __${prefix}NETWORK_H__
#define __${prefix}NETWORK_H__

% if prefix != "":
// SCHEREMO: Let the preprocessor mangle for us...
#define execute_layer_fork ${prefix}execute_layer_fork
#define network_run ${prefix}network_run
#define layer_args_t ${prefix}layer_args_t
% endif

% if sdk == 'gap_sdk':
#include "pulp.h"
@@ -28,13 +35,13 @@
single_input = n_inputs==1
%>\
% if not l3_supported and files_list != ' ':
#include "weights_definition.h"
#include "${prefix}weights_definition.h"
% endif
#include <stddef.h>

%for node in DORY_HW_graph:
%if hasattr(node, "offloadable") and node.offloadable and hasattr(node, "use_wmem") and node.use_wmem:
#include "${node.name}_weights.h"
#include "${prefix}${node.name}_weights.h"
%endif
%endfor

@@ -78,7 +85,7 @@ static int layers_pointers[${len(DORY_HW_graph)}];
% endif
static char * Layers_name[${len(DORY_HW_graph)}] = {\
% for node in DORY_HW_graph:
"${node.name}"${'' if loop.last else ', '}\
"${prefix}${node.name}"${'' if loop.last else ', '}\
% endfor
};
% if l3_supported:
@@ -114,9 +121,9 @@ static int allocate_layer[${len(DORY_HW_graph)}] = {\
static char *Weights_name[${len(DORY_HW_graph)}] = {\
% for i in range(len(DORY_HW_graph)):
% if (not (hasattr(DORY_HW_graph[i], "offloadable") and DORY_HW_graph[i].offloadable and hasattr(DORY_HW_graph[i], "use_wmem") and DORY_HW_graph[i].use_wmem)) and( 'Conv' in DORY_HW_graph[i].name or 'FullyConnected' in DORY_HW_graph[i].name):
Weights_${DORY_HW_graph[i].name}${'' if loop.last else ', '}\
${prefix}Weights_${DORY_HW_graph[i].name}${'' if loop.last else ', '} \
% elif (hasattr(DORY_HW_graph[i], "offloadable") and DORY_HW_graph[i].offloadable and hasattr(DORY_HW_graph[i], "use_wmem") and DORY_HW_graph[i].use_wmem) and( 'Conv' in DORY_HW_graph[i].name or 'FullyConnected' in DORY_HW_graph[i].name):
${DORY_HW_graph[i].name}_weights${'' if loop.last else ', '}\
${prefix}${DORY_HW_graph[i].name}_weights${'' if loop.last else ', '}\
% else:
"None"${'' if loop.last else ', '}\
% endif
@@ -230,7 +237,7 @@ static int layer_with_weights[${len(DORY_HW_graph)}] = {\
static void* layer_wmem_ptr[${len(DORY_HW_graph)}] = {\
% for node in DORY_HW_graph:
% if hasattr(node, "offloadable") and node.offloadable and hasattr(node, "use_wmem") and node.use_wmem:
${node.name}_weights${'' if loop.last else ', '}\
${prefix}${node.name}_weights${'' if loop.last else ', '}\
% else:
NULL${'' if loop.last else ', '}\
% endif
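The interesting trick in this header is the "let the preprocessor mangle for us" block: when a non-empty prefix is configured, `#define`s rename the public entry points (`network_run`, `execute_layer_fork`, `layer_args_t`) so two generated networks link into one binary without symbol collisions, while callers keep using the unprefixed names. A sketch of what the Mako template renders, mimicked with plain string formatting:

```python
def render_prefixed_guard(prefix):
    """Sketch of the guard + mangling block emitted by network.h.t for
    a given prefix; an empty prefix yields the old unprefixed header."""
    lines = [f"#ifndef __{prefix}NETWORK_H__",
             f"#define __{prefix}NETWORK_H__"]
    if prefix:
        lines.append("// Let the preprocessor mangle for us...")
        for sym in ("execute_layer_fork", "network_run", "layer_args_t"):
            lines.append(f"#define {sym} {prefix}{sym}")
    return "\n".join(lines)
```

The same prefixing also lands in the include guard itself, so two prefixed copies of the header can both be included.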
dory/Hardware_targets/Siracusa/Common/Templates/weights_definition_h_template.h
@@ -22,7 +22,7 @@
#define __WEIGHTS_DEFINITION_H__
% for i in range(len(weights_vectors)):
% if weights_dimensions[i] > 0:
extern uint8_t Weights_${weights_names[i]}[${weights_dimensions[i]}];
extern uint8_t ${prefix}Weights_${weights_names[i]}[${weights_dimensions[i]}];
% endif
% endfor
#endif
dory/Hardware_targets/Siracusa/Common/Templates/weights_h_template.h
@@ -26,9 +26,9 @@
% for i in range(len(weights_vectors)):
% if weights_dimensions[i] > 0:
% if sdk == 'gap_sdk':
L2_DATA uint8_t Weights_${weights_names[i]}[${weights_dimensions[i]}] = {
L2_DATA uint8_t ${prefix}Weights_${weights_names[i]}[${weights_dimensions[i]}] = {
% else:
PI_L2 uint8_t Weights_${weights_names[i]}[${weights_dimensions[i]}] = {
PI_L2 uint8_t ${prefix}Weights_${weights_names[i]}[${weights_dimensions[i]}] = {
% endif
${weights_vectors[i]}};
% endif