diff --git a/.gitignore b/.gitignore index 8e237dfd..3fdb2d06 100644 --- a/.gitignore +++ b/.gitignore @@ -11,7 +11,9 @@ src/statsEnums.h src/cscope tools/mem_trace_generator/*.raw tools/mem_trace_generator/mem_trace -bin/ +bin/macsim +bin/*.out +bin/*.out.0 .dbg_build/ *.lo *.la diff --git a/.gitmodules b/.gitmodules index 6145f6cf..b2424635 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,9 @@ [submodule "internal"] path = internal - url = https://github.com/gthparch/macsim_internal + url = git@github.com:gthparch/macsim_internal.git [submodule "src/rwqueue"] path = src/rwqueue url = https://github.com/cameron314/readerwriterqueue +[submodule "tools/CUDA_trace_generator"] + path = tools/CUDA_trace_generator + url = git@github.com:ejchung0406/CUDA_trace_generator.git diff --git a/Makefile.am b/Makefile.am index 2771980c..357a530f 100644 --- a/Makefile.am +++ b/Makefile.am @@ -85,6 +85,7 @@ src/trace_read_cpu.cc src/trace_read_cpu.h \ src/trace_read_gpu.cc src/trace_read_gpu.h \ src/trace_read_a64.cc src/trace_read_a64.h \ src/trace_read_igpu.cc src/trace_read_igpu.h \ +src/trace_read_nvbit.cc src/trace_read_nvbit.h \ src/uop.cc src/uop.h \ src/utils.cc src/utils.h \ src/network.cc src/network.h \ diff --git a/SConscript b/SConscript index 5cfcdb8b..49315654 100644 --- a/SConscript +++ b/SConscript @@ -274,6 +274,7 @@ macsim_src = [ 'src/trace_read_gpu.cc', 'src/trace_read_a64.cc', 'src/trace_read_igpu.cc', + 'src/trace_read_nvbit.cc', 'src/page_mapping.cc', 'src/dyfr.cc', 'src/hmc_process.cc', diff --git a/bin/params.in b/bin/params.in index c9926c24..2759ac9f 100644 --- a/bin/params.in +++ b/bin/params.in @@ -1,71 +1,82 @@ + # Simulation Configuration -num_sim_cores 1 -num_sim_small_cores 0 +num_sim_cores 16 +num_sim_small_cores 16 +core_type nvbit +max_threads_per_core 1024 num_sim_medium_cores 0 -num_sim_large_cores 1 -large_core_type x86 -sim_cycle_count 0 -max_insts 3000000 -heartbeat_interval 1000000 -forward_progress_limit 1000000 -core_thread_sched balanced +num_sim_large_cores 0 + # Clock -clock_cpu 1.15 -clock_gpu 1.15 -clock_noc 1.15 -clock_mc 1.15 +# from device query for gtx580 on damint - gpu clock - 1.66 GHz, mem clock - 2100 MHz +clock_cpu 2.0 +clock_gpu 2.0 +clock_llc 2.0 +clock_noc 2.0 +clock_mc 2.0 + -# Common Core Configuration +# Small Core Configuration +fetch_wdith 4 +width 1 +fetch_latency 5 +alloc_latency 5 +rob_size 1024 +schedule ooo +isched_rate 4 +msched_rate 4 +fsched_rate 4 +bp_hist_length 14 +max_block_per_core 8 fetch_policy rr mt_no_fetch_br 1 -one_cycle_exec 0 -uop_latency_map x86 - -# Large Core Configuration -large_width 2 -large_core_fetch_latency 5 -large_core_alloc_latency 10 -isched_large_rate 4 -msched_large_rate 2 -fsched_large_rate 2 -ssched_large_rate 1 -isched_large_size 64 -msched_large_size 32 -fsched_large_size 96 -ssched_large_size 128 -bp_hist_length 16 -rob_large_size 512 -large_core_schedule ooo -max_threads_per_large_core 7 - -mem_mshr_size 9 - -# L3-I -icache_large_num_set 4096 # 768 KB -icache_large_assoc 3 -icache_large_line_size 64 -icache_large_cycles 14 - -# L3-D -l1_large_num_set 512 # 512 KB -l1_large_assoc 16 -l1_large_line_size 64 -l1_large_latency 100 -l1_large_bypass 0 +fetch_only_load_ready 0 +schedule_ratio 4 +fetch_ratio 4 +gpu_sched 1 +icache_num_set 8 + # Memory -memory_type igpu_network +memory_type l2_decoupled_network +perfect_dcache 0 +enable_cache_coherence 0 +dram_merge_requests 1 +mem_ooo_stores 0 +ptx_common_cache 0 +const_cache_size 8192 +texture_cache_size 8192 +shared_mem_size 16384 +shared_mem_banks 32 
+shared_mem_cycles 2 +shared_mem_ports 1 +byte_level_access 0 + +l1_small_line_size 128 +#96 KB +l1_small_num_set 128 +l1_small_assoc 6 +#16 KB +#l1_small_num_set 32 +#l1_small_assoc 4 + +l1_small_latency 30 +l2_small_latency 100 +llc_latency 200 + +# L3 Cache (4.5MB 24 way) +num_llc 12 +llc_num_set 128 +llc_line_size 128 +llc_assoc 24 +llc_num_bank 4 +llc_latency 200 -# LLC -num_l3 1 -l3_num_set 8192 -l3_assoc 32 -l3_line_size 64 -l3_latency 100 # DRAM -dram_bus_width 4 +dram_num_mc 6 +dram_bus_width 8 dram_column 11 dram_activate 25 dram_precharge 10 @@ -73,13 +84,29 @@ dram_num_banks 16 dram_num_channel 8 dram_rowbuffer_size 2048 dram_scheduling_policy FRFCFS -dram_additional_latency 95 -# ETC + + +infinite_port 0 +pref_train_inst_once 0 +pref_framework_on 1 + + + bug_detector_enable 1 -perfect_icache 1 -ideal_noc 1 +sim_cycle_count 0 +max_insts 200000000 +heartbeat_interval 1000000 +forward_progress_limit 100000 +blocks_to_simulate 0 +ptx_exec_ratio 2 +num_warp_scheduler 2 + + +noc_topology simple_noc +noc_dimension 0 +link_width 32 # DEBUG debug_core_id 0 @@ -93,9 +120,8 @@ debug_dcu_stage 0 debug_retire_stage 0 debug_map_stage 0 debug_mem 0 -debug_trace_read 0 +debug_trace_read 1 +debug_print_trace 1 debug_sim_thread_schedule 0 debug_cache_lib 0 debug_bp_dir 0 -debug_print_trace 0 -debug_noc 0 diff --git a/bin/trace_file_list b/bin/trace_file_list index 6ea4f4b1..f84c351d 100644 --- a/bin/trace_file_list +++ b/bin/trace_file_list @@ -1,2 +1,2 @@ 1 -../sst-unit-test/traces/cachesize_1/trace.txt \ No newline at end of file +../sst-unit-test/traces/nvbit/vectormultadd/65536/kernel_config.txt \ No newline at end of file diff --git a/def/uoplatency_nvbit580.def b/def/uoplatency_nvbit580.def new file mode 100644 index 00000000..063d017d --- /dev/null +++ b/def/uoplatency_nvbit580.def @@ -0,0 +1,508 @@ +/* +Copyright (c) <2012>, All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list of conditions +and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or other materials provided +with the distribution. + +Neither the name of the nor the names of its contributors +may be used to endorse or promote products derived from this software without specific prior +written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY +AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+*/ + + + /* + * The arguments to DEFUOP(opcode, latency) are as follows: + * opcode - opcode name + * latency - opcode latency + */ + + +DEFUOP( UOP_INV, 25) +DEFUOP( UOP_NOP, 25) +DEFUOP( UOP_CF, 25) +DEFUOP( UOP_CMOV, 25) +DEFUOP( UOP_LDA, 25) +DEFUOP( UOP_IMEM, 25) +DEFUOP( UOP_LD, 25) +DEFUOP( UOP_ST, 25) +DEFUOP( UOP_IADD, 25) +DEFUOP( UOP_IMUL, 100) +DEFUOP( UOP_ICMP, 25) +DEFUOP( UOP_LOGIC, 25) +DEFUOP( UOP_SHIFT, 25) +DEFUOP( UOP_BYTE, 25) +DEFUOP( UOP_MM, 25) + +DEFUOP( UOP_FMEM, 19) +DEFUOP( UOP_FCF, 19) +DEFUOP( UOP_FCVT, 19) +DEFUOP( UOP_FADD, 19) +DEFUOP( UOP_FMUL, 19) +DEFUOP( UOP_FDIV, 152) +DEFUOP( UOP_FCMP, 19) +DEFUOP( UOP_FBIT, 19) +DEFUOP( UOP_FCMOV, 19) + +DEFUOP( UOP_SSE, 19) + +DEFUOP( UOP_GPU_ABS, 25) +DEFUOP( UOP_GPU_ABS64, 25) +DEFUOP( UOP_GPU_ADD, 25) +DEFUOP( UOP_GPU_ADD64, 25) +DEFUOP( UOP_GPU_ADDC, 25) +DEFUOP( UOP_GPU_AND, 25) +DEFUOP( UOP_GPU_AND64, 25) +DEFUOP( UOP_GPU_ATOM_GM, 25) +DEFUOP( UOP_GPU_ATOM_SM, 25) +DEFUOP( UOP_GPU_ATOM64_GM, 25) +DEFUOP( UOP_GPU_ATOM64_SM, 25) +DEFUOP( UOP_GPU_BAR_ARRIVE, 25) +DEFUOP( UOP_GPU_BAR_SYNC, 25) +DEFUOP( UOP_GPU_BAR_RED, 25) +DEFUOP( UOP_GPU_BFE, 25) +DEFUOP( UOP_GPU_BFE64, 25) +DEFUOP( UOP_GPU_BFI, 25) +DEFUOP( UOP_GPU_BFI64, 25) +DEFUOP( UOP_GPU_BFIND, 25) +DEFUOP( UOP_GPU_BFIND64, 25) +DEFUOP( UOP_GPU_BRA, 25) +DEFUOP( UOP_GPU_BREV, 25) +DEFUOP( UOP_GPU_BREV64, 25) +DEFUOP( UOP_GPU_BRKPT, 25) +DEFUOP( UOP_GPU_CALL, 25) +DEFUOP( UOP_GPU_CLZ, 25) +DEFUOP( UOP_GPU_CLZ64, 25) +DEFUOP( UOP_GPU_CNOT, 25) +DEFUOP( UOP_GPU_CNOT64, 25) +DEFUOP( UOP_GPU_COPYSIGN, 25) +DEFUOP( UOP_GPU_COPYSIGN64, 25) +DEFUOP( UOP_GPU_COS, 25) +DEFUOP( UOP_GPU_CVT, 25) +DEFUOP( UOP_GPU_CVT64, 25) +DEFUOP( UOP_GPU_CVTA, 25) +DEFUOP( UOP_GPU_CVTA64, 25) +DEFUOP( UOP_GPU_DIV, 25) +DEFUOP( UOP_GPU_DIV64, 25) +DEFUOP( UOP_GPU_EX2, 25) +DEFUOP( UOP_GPU_EXIT, 25) +DEFUOP( UOP_GPU_FMA, 25) +DEFUOP( UOP_GPU_FMA64, 25) +DEFUOP( UOP_GPU_ISSPACEP, 25) +DEFUOP( UOP_GPU_LD, 25) +DEFUOP( UOP_GPU_LD64, 25) +DEFUOP( UOP_GPU_LDU, 25) +DEFUOP( UOP_GPU_LDU64, 25) +DEFUOP( UOP_GPU_LG2, 25) +DEFUOP( UOP_GPU_MAD24, 25) +DEFUOP( UOP_GPU_MAD, 25) +DEFUOP( UOP_GPU_MAD64, 25) +DEFUOP( UOP_GPU_MADC, 25) +DEFUOP( UOP_GPU_MADC64, 25) +DEFUOP( UOP_GPU_MAX, 25) +DEFUOP( UOP_GPU_MAX64, 25) +DEFUOP( UOP_GPU_MEMBAR_CTA, 25) +DEFUOP( UOP_GPU_MEMBAR_GL, 25) +DEFUOP( UOP_GPU_MEMBAR_SYS, 25) +DEFUOP( UOP_GPU_MIN, 25) +DEFUOP( UOP_GPU_MIN64, 25) +DEFUOP( UOP_GPU_MOV, 25) +DEFUOP( UOP_GPU_MOV64, 25) +DEFUOP( UOP_GPU_MUL24, 25) +DEFUOP( UOP_GPU_MUL, 25) +DEFUOP( UOP_GPU_MUL64, 25) +DEFUOP( UOP_GPU_NEG, 25) +DEFUOP( UOP_GPU_NEG64, 25) +DEFUOP( UOP_GPU_NOT, 25) +DEFUOP( UOP_GPU_NOT64, 25) +DEFUOP( UOP_GPU_OR, 25) +DEFUOP( UOP_GPU_OR64, 25) +DEFUOP( UOP_GPU_PMEVENT, 25) +DEFUOP( UOP_GPU_POPC, 25) +DEFUOP( UOP_GPU_POPC64, 25) +DEFUOP( UOP_GPU_PREFETCH, 25) +DEFUOP( UOP_GPU_PREFETCHU, 25) +DEFUOP( UOP_GPU_PRMT, 25) +DEFUOP( UOP_GPU_RCP, 25) +DEFUOP( UOP_GPU_RCP64, 25) +DEFUOP( UOP_GPU_RED_GM, 25) +DEFUOP( UOP_GPU_RED_SM, 25) +DEFUOP( UOP_GPU_RED64_GM, 25) +DEFUOP( UOP_GPU_RED64_SM, 25) +DEFUOP( UOP_GPU_REM, 25) +DEFUOP( UOP_GPU_REM64, 25) +DEFUOP( UOP_GPU_RET, 25) +DEFUOP( UOP_GPU_RSQRT, 25) +DEFUOP( UOP_GPU_RSQRT64, 25) +DEFUOP( UOP_GPU_SAD, 25) +DEFUOP( UOP_GPU_SAD64, 25) +DEFUOP( UOP_GPU_SELP, 25) +DEFUOP( UOP_GPU_SELP64, 25) +DEFUOP( UOP_GPU_SET, 25) +DEFUOP( UOP_GPU_SET64, 25) +DEFUOP( UOP_GPU_SETP, 25) +DEFUOP( UOP_GPU_SETP64, 25) +DEFUOP( UOP_GPU_SHL, 25) +DEFUOP( UOP_GPU_SHL64, 25) +DEFUOP( UOP_GPU_SHFL, 25) +DEFUOP( UOP_GPU_SHFL64, 25) +DEFUOP( UOP_GPU_SHR, 25) +DEFUOP( 
UOP_GPU_SHR64, 25) +DEFUOP( UOP_GPU_SIN, 25) +DEFUOP( UOP_GPU_SLCT, 25) +DEFUOP( UOP_GPU_SLCT64, 25) +DEFUOP( UOP_GPU_SQRT, 25) +DEFUOP( UOP_GPU_SQRT64, 25) +DEFUOP( UOP_GPU_ST, 25) +DEFUOP( UOP_GPU_ST64, 25) +DEFUOP( UOP_GPU_SUB, 25) +DEFUOP( UOP_GPU_SUB64, 25) +DEFUOP( UOP_GPU_SUBC, 25) +DEFUOP( UOP_GPU_SULD, 25) +DEFUOP( UOP_GPU_SULD64, 25) +DEFUOP( UOP_GPU_SURED, 25) +DEFUOP( UOP_GPU_SURED64, 25) +DEFUOP( UOP_GPU_SUST, 25) +DEFUOP( UOP_GPU_SUST64, 25) +DEFUOP( UOP_GPU_SUQ, 25) +DEFUOP( UOP_GPU_TESTP, 25) +DEFUOP( UOP_GPU_TESTP64, 25) +DEFUOP( UOP_GPU_TEX, 25) +DEFUOP( UOP_GPU_TLD4, 25) +DEFUOP( UOP_GPU_TXQ, 25) +DEFUOP( UOP_GPU_TRAP, 25) +DEFUOP( UOP_GPU_VABSDIFF, 25) +DEFUOP( UOP_GPU_VADD, 25) +DEFUOP( UOP_GPU_VMAD, 25) +DEFUOP( UOP_GPU_VMAX, 25) +DEFUOP( UOP_GPU_VMIN, 25) +DEFUOP( UOP_GPU_VSET, 25) +DEFUOP( UOP_GPU_VSHL, 25) +DEFUOP( UOP_GPU_VSHR, 25) +DEFUOP( UOP_GPU_VSUB, 25) +DEFUOP( UOP_GPU_VOTE, 25) +DEFUOP( UOP_GPU_XOR, 25) +DEFUOP( UOP_GPU_XOR64, 25) +DEFUOP( UOP_GPU_RECONVERGE, 25) +DEFUOP( UOP_GPU_PHI, 25) + +DEFUOP( UOP_GPU_FABS, 19) +DEFUOP( UOP_GPU_FABS64, 19) +DEFUOP( UOP_GPU_FADD, 19) +DEFUOP( UOP_GPU_FADD64, 19) +DEFUOP( UOP_GPU_FADDC, 19) +DEFUOP( UOP_GPU_FAND, 19) +DEFUOP( UOP_GPU_FAND64, 19) +DEFUOP( UOP_GPU_FATOM_GM, 19) +DEFUOP( UOP_GPU_FATOM_SM, 19) +DEFUOP( UOP_GPU_FATOM64_GM, 19) +DEFUOP( UOP_GPU_FATOM64_SM, 19) +DEFUOP( UOP_GPU_FBAR_ARRIVE, 19) +DEFUOP( UOP_GPU_FBAR_SYNC, 19) +DEFUOP( UOP_GPU_FBAR_RED, 19) +DEFUOP( UOP_GPU_FBFE, 19) +DEFUOP( UOP_GPU_FBFE64, 19) +DEFUOP( UOP_GPU_FBFI, 19) +DEFUOP( UOP_GPU_FBFI64, 19) +DEFUOP( UOP_GPU_FBFIND, 19) +DEFUOP( UOP_GPU_FBFIND64, 19) +DEFUOP( UOP_GPU_FBRA, 19) +DEFUOP( UOP_GPU_FBREV, 19) +DEFUOP( UOP_GPU_FBREV64, 19) +DEFUOP( UOP_GPU_FBRKPT, 19) +DEFUOP( UOP_GPU_FCALL, 19) +DEFUOP( UOP_GPU_FCLZ, 19) +DEFUOP( UOP_GPU_FCLZ64, 19) +DEFUOP( UOP_GPU_FCNOT, 19) +DEFUOP( UOP_GPU_FCNOT64, 19) +DEFUOP( UOP_GPU_FCOPYSIGN, 19) +DEFUOP( UOP_GPU_FCOPYSIGN64, 19) +DEFUOP( UOP_GPU_FCOS, 19) +DEFUOP( UOP_GPU_FCVT, 19) +DEFUOP( UOP_GPU_FCVT64, 19) +DEFUOP( UOP_GPU_FCVTA, 19) +DEFUOP( UOP_GPU_FCVTA64, 19) +DEFUOP( UOP_GPU_FDIV, 19) +DEFUOP( UOP_GPU_FDIV64, 19) +DEFUOP( UOP_GPU_FEX2, 19) +DEFUOP( UOP_GPU_FEXIT, 19) +DEFUOP( UOP_GPU_FFMA, 19) +DEFUOP( UOP_GPU_FFMA64, 19) +DEFUOP( UOP_GPU_FISSPACEP, 19) +DEFUOP( UOP_GPU_FLD, 19) +DEFUOP( UOP_GPU_FLD64, 19) +DEFUOP( UOP_GPU_FLDU, 19) +DEFUOP( UOP_GPU_FLDU64, 19) +DEFUOP( UOP_GPU_FLG2, 19) +DEFUOP( UOP_GPU_FMAD24, 19) +DEFUOP( UOP_GPU_FMAD, 19) +DEFUOP( UOP_GPU_FMAD64, 19) +DEFUOP( UOP_GPU_FMADC, 19) +DEFUOP( UOP_GPU_FMADC64, 19) +DEFUOP( UOP_GPU_FMAX, 19) +DEFUOP( UOP_GPU_FMAX64, 19) +DEFUOP( UOP_GPU_FMEMBAR_CTA, 19) +DEFUOP( UOP_GPU_FMEMBAR_GL, 19) +DEFUOP( UOP_GPU_FMEMBAR_SYS, 19) +DEFUOP( UOP_GPU_FMIN, 19) +DEFUOP( UOP_GPU_FMIN64, 19) +DEFUOP( UOP_GPU_FMOV, 19) +DEFUOP( UOP_GPU_FMOV64, 19) +DEFUOP( UOP_GPU_FMUL24, 19) +DEFUOP( UOP_GPU_FMUL, 19) +DEFUOP( UOP_GPU_FMUL64, 19) +DEFUOP( UOP_GPU_FNEG, 19) +DEFUOP( UOP_GPU_FNEG64, 19) +DEFUOP( UOP_GPU_FNOT, 19) +DEFUOP( UOP_GPU_FNOT64, 19) +DEFUOP( UOP_GPU_FOR, 19) +DEFUOP( UOP_GPU_FOR64, 19) +DEFUOP( UOP_GPU_FPMEVENT, 19) +DEFUOP( UOP_GPU_FPOPC, 19) +DEFUOP( UOP_GPU_FPOPC64, 19) +DEFUOP( UOP_GPU_FPREFETCH, 19) +DEFUOP( UOP_GPU_FPREFETCHU, 19) +DEFUOP( UOP_GPU_FPRMT, 19) +DEFUOP( UOP_GPU_FRCP, 19) +DEFUOP( UOP_GPU_FRCP64, 19) +DEFUOP( UOP_GPU_FRED_GM, 19) +DEFUOP( UOP_GPU_FRED_SM, 19) +DEFUOP( UOP_GPU_FRED64_GM, 19) +DEFUOP( UOP_GPU_FRED64_SM, 19) +DEFUOP( UOP_GPU_FREM, 19) +DEFUOP( UOP_GPU_FREM64, 19) +DEFUOP( UOP_GPU_FRET, 19) +DEFUOP( 
UOP_GPU_FRSQRT, 19) +DEFUOP( UOP_GPU_FRSQRT64, 19) +DEFUOP( UOP_GPU_FSAD, 19) +DEFUOP( UOP_GPU_FSAD64, 19) +DEFUOP( UOP_GPU_FSELP, 19) +DEFUOP( UOP_GPU_FSELP64, 19) +DEFUOP( UOP_GPU_FSET, 19) +DEFUOP( UOP_GPU_FSET64, 19) +DEFUOP( UOP_GPU_FSETP, 19) +DEFUOP( UOP_GPU_FSETP64, 19) +DEFUOP( UOP_GPU_FSHFL, 19) +DEFUOP( UOP_GPU_FSHFL64, 19) +DEFUOP( UOP_GPU_FSHL, 19) +DEFUOP( UOP_GPU_FSHL64, 19) +DEFUOP( UOP_GPU_FSHR, 19) +DEFUOP( UOP_GPU_FSHR64, 19) +DEFUOP( UOP_GPU_FSIN, 19) +DEFUOP( UOP_GPU_FSLCT, 19) +DEFUOP( UOP_GPU_FSLCT64, 19) +DEFUOP( UOP_GPU_FSQRT, 19) +DEFUOP( UOP_GPU_FSQRT64, 19) +DEFUOP( UOP_GPU_FST, 19) +DEFUOP( UOP_GPU_FST64, 19) +DEFUOP( UOP_GPU_FSUB, 19) +DEFUOP( UOP_GPU_FSUB64, 19) +DEFUOP( UOP_GPU_FSUBC, 19) +DEFUOP( UOP_GPU_FSULD, 19) +DEFUOP( UOP_GPU_FSULD64, 19) +DEFUOP( UOP_GPU_FSURED, 19) +DEFUOP( UOP_GPU_FSURED64, 19) +DEFUOP( UOP_GPU_FSUST, 19) +DEFUOP( UOP_GPU_FSUST64, 19) +DEFUOP( UOP_GPU_FSUQ, 19) +DEFUOP( UOP_GPU_FTESTP, 19) +DEFUOP( UOP_GPU_FTESTP64, 19) +DEFUOP( UOP_GPU_FTEX, 19) +DEFUOP( UOP_GPU_FTLD4, 19) +DEFUOP( UOP_GPU_FTXQ, 19) +DEFUOP( UOP_GPU_FTRAP, 19) +DEFUOP( UOP_GPU_FVABSDIFF, 19) +DEFUOP( UOP_GPU_FVADD, 19) +DEFUOP( UOP_GPU_FVMAD, 19) +DEFUOP( UOP_GPU_FVMAX, 19) +DEFUOP( UOP_GPU_FVMIN, 19) +DEFUOP( UOP_GPU_FVSET, 19) +DEFUOP( UOP_GPU_FVSHL, 19) +DEFUOP( UOP_GPU_FVSHR, 19) +DEFUOP( UOP_GPU_FVSUB, 19) +DEFUOP( UOP_GPU_FVOTE, 19) +DEFUOP( UOP_GPU_FXOR, 19) +DEFUOP( UOP_GPU_FXOR64, 19) +DEFUOP( UOP_GPU_FRECONVERGE, 19) +DEFUOP( UOP_GPU_FPHI, 19) + + +DEFUOP( UOP_NVBIT_FADD , 25) +DEFUOP( UOP_NVBIT_FADD32I , 25) +DEFUOP( UOP_NVBIT_FCHK , 25) +DEFUOP( UOP_NVBIT_FFMA32I , 25) +DEFUOP( UOP_NVBIT_FFMA , 25) +DEFUOP( UOP_NVBIT_FMNMX , 25) +DEFUOP( UOP_NVBIT_FMUL , 25) +DEFUOP( UOP_NVBIT_FMUL32I , 25) +DEFUOP( UOP_NVBIT_FSEL , 25) +DEFUOP( UOP_NVBIT_FSET , 25) +DEFUOP( UOP_NVBIT_FSETP , 25) +DEFUOP( UOP_NVBIT_FSWZADD , 25) +DEFUOP( UOP_NVBIT_MUFU , 25) +DEFUOP( UOP_NVBIT_HADD2 , 25) +DEFUOP( UOP_NVBIT_HADD2_32I , 25) +DEFUOP( UOP_NVBIT_HFMA2 , 25) +DEFUOP( UOP_NVBIT_HFMA2_32I , 25) +DEFUOP( UOP_NVBIT_HMMA , 25) +DEFUOP( UOP_NVBIT_HMUL2 , 25) +DEFUOP( UOP_NVBIT_HMUL2_32I , 25) +DEFUOP( UOP_NVBIT_HSET2 , 25) +DEFUOP( UOP_NVBIT_HSETP2 , 25) +DEFUOP( UOP_NVBIT_DADD , 25) +DEFUOP( UOP_NVBIT_DFMA , 25) +DEFUOP( UOP_NVBIT_DMUL , 25) +DEFUOP( UOP_NVBIT_DSETP , 25) +DEFUOP( UOP_NVBIT_BMMA , 25) +DEFUOP( UOP_NVBIT_BMSK , 25) +DEFUOP( UOP_NVBIT_BREV , 25) +DEFUOP( UOP_NVBIT_FLO , 25) +DEFUOP( UOP_NVBIT_IABS , 25) +DEFUOP( UOP_NVBIT_IADD , 25) +DEFUOP( UOP_NVBIT_IADD3 , 25) +DEFUOP( UOP_NVBIT_IADD32I , 25) +DEFUOP( UOP_NVBIT_IDP , 25) +DEFUOP( UOP_NVBIT_IDP4A , 25) +DEFUOP( UOP_NVBIT_IMAD , 25) +DEFUOP( UOP_NVBIT_IMMA , 25) +DEFUOP( UOP_NVBIT_IMNMX , 25) +DEFUOP( UOP_NVBIT_IMUL , 25) +DEFUOP( UOP_NVBIT_IMUL32I , 25) +DEFUOP( UOP_NVBIT_ISCADD , 25) +DEFUOP( UOP_NVBIT_ISCADD32I , 25) +DEFUOP( UOP_NVBIT_ISETP , 25) +DEFUOP( UOP_NVBIT_LEA , 25) +DEFUOP( UOP_NVBIT_LOP , 25) +DEFUOP( UOP_NVBIT_LOP3 , 25) +DEFUOP( UOP_NVBIT_LOP32I , 25) +DEFUOP( UOP_NVBIT_POPC , 25) +DEFUOP( UOP_NVBIT_SHF , 25) +DEFUOP( UOP_NVBIT_SHL , 25) +DEFUOP( UOP_NVBIT_SHR , 25) +DEFUOP( UOP_NVBIT_VABSDIFF , 25) +DEFUOP( UOP_NVBIT_VABSDIFF4 , 25) +DEFUOP( UOP_NVBIT_F2F , 25) +DEFUOP( UOP_NVBIT_F2I , 25) +DEFUOP( UOP_NVBIT_I2F , 25) +DEFUOP( UOP_NVBIT_I2I , 25) +DEFUOP( UOP_NVBIT_I2IP , 25) +DEFUOP( UOP_NVBIT_FRND , 25) +DEFUOP( UOP_NVBIT_MOV , 25) +DEFUOP( UOP_NVBIT_MOV32I , 25) +DEFUOP( UOP_NVBIT_MOVM , 25) +DEFUOP( UOP_NVBIT_PRMT , 25) +DEFUOP( UOP_NVBIT_SEL , 25) +DEFUOP( UOP_NVBIT_SGXT , 25) +DEFUOP( 
UOP_NVBIT_SHFL , 25) +DEFUOP( UOP_NVBIT_PLOP3 , 25) +DEFUOP( UOP_NVBIT_PSETP , 25) +DEFUOP( UOP_NVBIT_P2R , 25) +DEFUOP( UOP_NVBIT_R2P , 25) +DEFUOP( UOP_NVBIT_LD , 25) +DEFUOP( UOP_NVBIT_LDC , 25) +DEFUOP( UOP_NVBIT_LDG , 25) +DEFUOP( UOP_NVBIT_LDL , 25) +DEFUOP( UOP_NVBIT_LDS , 25) +DEFUOP( UOP_NVBIT_LDSM , 25) +DEFUOP( UOP_NVBIT_ST , 25) +DEFUOP( UOP_NVBIT_STG , 25) +DEFUOP( UOP_NVBIT_STL , 25) +DEFUOP( UOP_NVBIT_STS , 25) +DEFUOP( UOP_NVBIT_MATCH , 25) +DEFUOP( UOP_NVBIT_QSPC , 25) +DEFUOP( UOP_NVBIT_ATOM , 25) +DEFUOP( UOP_NVBIT_ATOMS , 25) +DEFUOP( UOP_NVBIT_ATOMG , 25) +DEFUOP( UOP_NVBIT_RED , 25) +DEFUOP( UOP_NVBIT_CCTL , 25) +DEFUOP( UOP_NVBIT_CCTLL , 25) +DEFUOP( UOP_NVBIT_ERRBAR , 25) +DEFUOP( UOP_NVBIT_MEMBAR , 25) +DEFUOP( UOP_NVBIT_CCTLT , 25) +DEFUOP( UOP_NVBIT_R2UR , 25) +DEFUOP( UOP_NVBIT_S2UR , 25) +DEFUOP( UOP_NVBIT_UBMSK , 25) +DEFUOP( UOP_NVBIT_UBREV , 25) +DEFUOP( UOP_NVBIT_UCLEA , 25) +DEFUOP( UOP_NVBIT_UFLO , 25) +DEFUOP( UOP_NVBIT_UIADD3 , 25) +DEFUOP( UOP_NVBIT_UIADD3_64 , 25) +DEFUOP( UOP_NVBIT_UIMAD , 25) +DEFUOP( UOP_NVBIT_UISETP , 25) +DEFUOP( UOP_NVBIT_ULDC , 25) +DEFUOP( UOP_NVBIT_ULEA , 25) +DEFUOP( UOP_NVBIT_ULOP , 25) +DEFUOP( UOP_NVBIT_ULOP3 , 25) +DEFUOP( UOP_NVBIT_ULOP32I , 25) +DEFUOP( UOP_NVBIT_UMOV , 25) +DEFUOP( UOP_NVBIT_UP2UR , 25) +DEFUOP( UOP_NVBIT_UPLOP3, 25) +DEFUOP( UOP_NVBIT_UPOPC, 25) +DEFUOP( UOP_NVBIT_UPRMT, 25) +DEFUOP( UOP_NVBIT_UPSETP, 25) +DEFUOP( UOP_NVBIT_UR2UP, 25) +DEFUOP( UOP_NVBIT_USEL , 25) +DEFUOP( UOP_NVBIT_USGXT , 25) +DEFUOP( UOP_NVBIT_USHF , 25) +DEFUOP( UOP_NVBIT_USHL , 25) +DEFUOP( UOP_NVBIT_USHR , 25) +DEFUOP( UOP_NVBIT_VOTEU , 25) +DEFUOP( UOP_NVBIT_TEX , 25) +DEFUOP( UOP_NVBIT_TLD , 25) +DEFUOP( UOP_NVBIT_TLD4 , 25) +DEFUOP( UOP_NVBIT_TMML , 25) +DEFUOP( UOP_NVBIT_TXD , 25) +DEFUOP( UOP_NVBIT_TXQ , 25) +DEFUOP( UOP_NVBIT_SUATOM , 25) +DEFUOP( UOP_NVBIT_SULD , 25) +DEFUOP( UOP_NVBIT_SURED , 25) +DEFUOP( UOP_NVBIT_SUST , 25) +DEFUOP( UOP_NVBIT_BMOV , 25) +DEFUOP( UOP_NVBIT_BPT , 25) +DEFUOP( UOP_NVBIT_BRA , 25) +DEFUOP( UOP_NVBIT_BREAK , 25) +DEFUOP( UOP_NVBIT_BRX , 25) +DEFUOP( UOP_NVBIT_BRXU , 25) +DEFUOP( UOP_NVBIT_BSSY , 25) +DEFUOP( UOP_NVBIT_BSYNC , 25) +DEFUOP( UOP_NVBIT_CALL , 25) +DEFUOP( UOP_NVBIT_EXIT , 25) +DEFUOP( UOP_NVBIT_JMP , 25) +DEFUOP( UOP_NVBIT_JMX , 25) +DEFUOP( UOP_NVBIT_JMXU , 25) +DEFUOP( UOP_NVBIT_KILL , 25) +DEFUOP( UOP_NVBIT_NANOSLEEP , 25) +DEFUOP( UOP_NVBIT_RET , 25) +DEFUOP( UOP_NVBIT_RPCMOV , 25) +DEFUOP( UOP_NVBIT_RTT , 25) +DEFUOP( UOP_NVBIT_WARPSYNC , 25) +DEFUOP( UOP_NVBIT_YIELD , 25) +DEFUOP( UOP_NVBIT_B2R , 25) +DEFUOP( UOP_NVBIT_BAR , 25) +DEFUOP( UOP_NVBIT_CS2R , 25) +DEFUOP( UOP_NVBIT_DEPBAR , 25) +DEFUOP( UOP_NVBIT_GETLMEMBASE , 25) +DEFUOP( UOP_NVBIT_LEPC , 25) +DEFUOP( UOP_NVBIT_NOP , 25) +DEFUOP( UOP_NVBIT_PMTRIG , 25) +DEFUOP( UOP_NVBIT_R2B , 25) +DEFUOP( UOP_NVBIT_S2R , 25) +DEFUOP( UOP_NVBIT_SETCTAID , 25) +DEFUOP( UOP_NVBIT_SETLMEMBASE , 25) +DEFUOP( UOP_NVBIT_VOTE , 25) diff --git a/internal b/internal index 2a78c449..0af43849 160000 --- a/internal +++ b/internal @@ -1 +1 @@ -Subproject commit 2a78c4493c05682101c0ce6ff66c673a2f332800 +Subproject commit 0af4384915735adaa3851a6b1149dc5cbc91c673 diff --git a/macsimComponent.cpp b/macsimComponent.cpp index 4bfc53af..681e4b15 100644 --- a/macsimComponent.cpp +++ b/macsimComponent.cpp @@ -66,6 +66,10 @@ macsimComponent::macsimComponent(ComponentId_t id, Params& params) } else if (params.find("igpu_core", 0)) { m_acc_type = IGPU_CORE; m_acc_core = 1; + } else if (params.find("nvbit_core", 0)) { + m_acc_type = NVBIT_CORE; + 
m_acc_core = 1; + } else { m_acc_core = 0; m_acc_type = NO_ACC; diff --git a/run_cmd.py b/run_cmd.py new file mode 100755 index 00000000..903e75db --- /dev/null +++ b/run_cmd.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 + +import os +import sys +import datetime +import numpy as np + +def main(): + now = datetime.datetime.now() + date = '%s%d' % (now.strftime("%b").lower(), now.day) + #date='nov23' + ptx_simulation = 0; + igpu_simulation = 0; + nvbit_simulation = 1; + + # bin = '/fast_data/jaewon/macsim_memsafety_mmu_eval/.opt_build/macsim' + bin = '/fast_data/echung67/macsim/bin/macsim' + ## ptx gpu simulation + + # tools = '/home/hyesoon/macsim_aos/internal/tools/run_batch_local.py' + # tools= '/fast_data/jaewon/macsim_memsafety_mmu_eval/internal/tools/run_batch_local.py' + tools = '/fast_data/echung67/macsim/internal/tools/run_batch_local.py' + # get_data_tool = '/home/hyesoon/macsim_aos/internal/tools/get_data.py' + # get_data_tool = '/fast_data/jaewon/macsim_memsafety_mmu_eval/internal/tools/get_data.py' + get_data_tool = '/fast_data/echung67/macsim/internal/tools/get_data.py' + + if (ptx_simulation == 1): + param = '/home/hyesoon/macsim_aos/bin/ptx_test/params.in' + max_insts = 1000 ## PTX simulation + #suite = 'ptx14-mb' + #suite = 'ptx14-all' + suite = 'aos-mb' + #suite = 'aos-stream' + + #get_data_suite_list = ['ptx14-all'] + #get_data_suite_list = ['aos-mb'] + #get_data_suite_list = ['aos-stream'] + #suite = 'aos-mb' + num_sim_cores = 16 + elif (igpu_simulation == 1): + #param = '/home/hyesoon/macsim_aos/bin/igpu_test/params.in' + param= '/fast_data/jaewon/macsim_memsafety_mmu_eval/bin/igpu_test/params.in' + max_insts = 1000000 ## igpu simulation + suite = 'aos-igpu-2' + num_sim_cores = 24 + get_data_suite_list = ['aos-igpu-2'] + elif (nvbit_simulation == 1): + param = '/fast_data/echung67/macsim/bin/params.in' + max_insts = 1000000 ## nvbit simulation + suite = 'rodinia31-nvbit-all' + num_sim_cores = 16 + get_data_suite_list = ['rodinia31-nvbit-all'] + + + # max_insts = 100000 + + + max_cycle = 0 + sim_cycle_count= 0 + #max_cycle = 1000000 + forward_progress_limit = 50000000 + + + + perfect_dcache = 0 + enable_physical_mapping = 1 + + + per_thread_frontend_q_size = 16 + per_thread_allocate_q_size = 16 + per_thread_schedule_q_size = 16 + + + desc = 'SIMPLE-LAT' + #base_stat_list = '-stat INST_COUNT_TOT -stat CYC_COUNT_TOT ' + stat_list = '-stat INST_COUNT_TOT -stat CYC_COUNT_TOT -stat L3_HIT_GPU -stat TOTAL_DRAM' + stat_core_list = ["OP_CAT_GED_SEND", "OP_CAT_GED_ADD", "OP_CAT_GED_ADDC", "OP_CAT_GED_OR", "OP_CAT_GED_AND"] + #stat_core_list = ["NUM_OF_BOUNDS_CHECKING", "BOUNDS_L0_CACHE_HIT" , "BOUNDS_L1_CACHE_HIT" ,"BOUNDS_INFO_INSERT", "BOUNDS_CHECK_SKIP_STATIC" ] + #stat_core_list = ["NUM_OF_BOUNDS_CHECKING", "BOUNDS_L0_CACHE_HIT" , "BOUNDS_L1_CACHE_HIT" ,"BOUNDS_INFO_INSERT", "BOUNDS_CHECK_SKIP_STATIC" ] + for stat_name in stat_core_list: + # addr = '%s' %(stat_name) + + #print addr + # for core_counts in range (0, 16): + for core_counts in range (0, 1): + new_stat_list = '-stat %s_CORE_%d' %(stat_name, core_counts) + stat_list = '%s %s' %(stat_list, new_stat_list) + base_cmd = '%s -bin %s -param %s -suite %s -cmd' % (tools, bin, param, suite) + + base_cmd = '%s \'num_sim_cores=%d --num_sim_small_cores=%d --sim_cycle_count=%d --max_insts=%d --forward_progress_limit=%d --perfect_dcache=%d --enable_physical_mapping=%d ' % (base_cmd, num_sim_cores, num_sim_cores, max_cycle, max_insts, forward_progress_limit, perfect_dcache, enable_physical_mapping) + + # bounds_l0_cache_lat 
bounds_l1_cache_lat 1, 3, -- 3, 10 " + + base_dir = 'nvbit/%s/%d-%d' % (date, max_insts, max_cycle) + file_name = 'get_data_cmd_%s.txt' %(date) + + # get_data_suite_list = ['aos-ML', 'aos-LA', 'aos-GT', 'aos-GI', 'aos-PS', 'aos-im', 'aos-dm'] + base_test = 0 + l1_lat_list = [ 1] + l2_lat_list = [3 ] + # l1_lat_list = [1,2,3] + #l2_lat_list = [3, 10, 20] + # l1_lat_list = [ 0, 1, 2, 3 ] + #l2_lat_list = [ 0, 1, 3, 10, 20] + #l2_lat_list = [1, 3] + #l0_entry_list = [ 1, 2, 4, 8]j + #l0_entry_list = [ 1,2, 4, 8, 16] + l0_entry_list = [ 4 ] + with open (file_name, 'w') as f: + for suite in get_data_suite_list: + for enable_bounds_checking in range (0,1) : + for enable_bounds_static_filter in range (0, 1): + for bounds_only_global_load_store in range (0, 1): + #for enable_bounds_checking in range (1,2) : + for l0_entry_num in l0_entry_list: + for l1_lat in l1_lat_list: + for l2_lat in l2_lat_list: + bounds_l0_cache_lat = l1_lat + bounds_l1_cache_lat = l2_lat + if (base_test == 0 or enable_bounds_checking != 0): + new_desc= '%s-bc-%s-l0lat%s-l1lat%s-l0entry%s-filter-%s-ldst-%s' % (desc, enable_bounds_checking, bounds_l0_cache_lat, bounds_l1_cache_lat, l0_entry_num,enable_bounds_static_filter,bounds_only_global_load_store) + + #new_cmd = '%s --enable_bounds_ids_file=1 --enable_bounds_checking=%d --bounds_l0_cache_lat=%d --bounds_l1_cache_lat=%d --bounds_l0_cache_entry=%d --bounds_l0_insert_latency=%d' % (base_cmd, enable_bounds_checking, bounds_l0_cache_lat, bounds_l1_cache_lat, l0_entry_num, bounds_l1_cache_lat) + + new_cmd = '%s --enable_bounds_ids_file=0 --bounds_only_global_load_store=%d --enable_bounds_static_filter=%d --enable_bounds_checking=%d --bounds_l0_cache_lat=%d --bounds_l1_cache_lat=%d --bounds_l0_cache_entry=%d --bounds_l0_insert_latency=%d' % (base_cmd, bounds_only_global_load_store, enable_bounds_static_filter, enable_bounds_checking, bounds_l0_cache_lat, bounds_l1_cache_lat, l0_entry_num, bounds_l1_cache_lat) + + #new_dir = '%s/ptx14-all/%s' % (base_dir, new_desc) + new_dir = '%s/%s/%s' %(base_dir,suite,new_desc) + + new_cmd = '%s\' -dir %s' % (new_cmd, new_dir) + + # get_data_cmd = '%s -d %s -widenames -disable-warning -amean -b base -prec 3 -suite %s %s >> summary_%s.txt\n'% (get_data_tool, new_dir, suite, base_stat_list, desc) + get_data_cmd = '%s -d %s -widenames -disable-warning -suite %s %s >> summary_%s.txt\n'% (get_data_tool, new_dir, suite, stat_list, desc) + + #print(new_cmd) + os.system(new_cmd) + f.write(get_data_cmd) + + if (enable_bounds_checking == 0): + base_test = 1 + + +if __name__ == '__main__': + main() diff --git a/src/bug_detector.cc b/src/bug_detector.cc index 8d79d0ba..4d003af9 100644 --- a/src/bug_detector.cc +++ b/src/bug_detector.cc @@ -169,7 +169,7 @@ void bug_detector_c::print(int core_id, int thread_id) { << left << (*m_uop_table[ii])[(*I)] << setw(15) << left << CYCLE - (*m_uop_table[ii])[(*I)] << setw(25) << left << uop_c::g_uop_state_name[uop->m_state] << setw(25) << left - << (core_type == "ptx" + << ((core_type == "ptx") || (core_type == "nvbit") ? gpu_decoder_c::g_tr_opcode_names[uop->m_opcode] : cpu_decoder_c::g_tr_opcode_names[uop->m_opcode]) << setw(20) << left << uop_c::g_uop_type_name[uop->m_uop_type] diff --git a/src/config.h b/src/config.h index 8a334276..25c2a076 100644 --- a/src/config.h +++ b/src/config.h @@ -50,7 +50,10 @@ POSSIBILITY OF SUCH DAMAGE. m_igpu_sim = m_simBase->m_knobs->KNOB_CORE_TYPE->getValue() == "igpu" \ ? 
true \ : false; \ - m_acc_sim = (m_igpu_sim || m_ptx_sim); \ + m_nvbit_sim = \ + m_simBase->m_knobs->KNOB_CORE_TYPE->getValue() == "nvbit" ? true \ + : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim || m_nvbit_sim); \ m_num_read_port = *KNOB(KNOB_L1_READ_PORTS); \ m_num_write_port = *KNOB(KNOB_L1_WRITE_PORTS); \ } else if (level == MEM_L2) { \ @@ -66,7 +69,10 @@ POSSIBILITY OF SUCH DAMAGE. m_igpu_sim = m_simBase->m_knobs->KNOB_CORE_TYPE->getValue() == "igpu" \ ? true \ : false; \ - m_acc_sim = (m_igpu_sim || m_ptx_sim); \ + m_nvbit_sim = \ + m_simBase->m_knobs->KNOB_CORE_TYPE->getValue() == "nvbit" ? true \ + : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim || m_nvbit_sim); \ m_num_read_port = *KNOB(KNOB_L2_READ_PORTS); \ m_num_write_port = *KNOB(KNOB_L2_WRITE_PORTS); \ } else if (level == MEM_L3) { \ @@ -81,7 +87,10 @@ POSSIBILITY OF SUCH DAMAGE. m_igpu_sim = m_simBase->m_knobs->KNOB_CORE_TYPE->getValue() == "igpu" \ ? true \ : false; \ - m_acc_sim = (m_igpu_sim || m_ptx_sim); \ + m_nvbit_sim = \ + m_simBase->m_knobs->KNOB_CORE_TYPE->getValue() == "nvbit" ? true \ + : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim || m_nvbit_sim); \ m_num_read_port = *KNOB(KNOB_L3_READ_PORTS); \ m_num_write_port = *KNOB(KNOB_L3_WRITE_PORTS); \ } else if (m_level == MEM_LLC) { \ @@ -96,7 +105,10 @@ POSSIBILITY OF SUCH DAMAGE. m_igpu_sim = m_simBase->m_knobs->KNOB_CORE_TYPE->getValue() == "igpu" \ ? true \ : false; \ - m_acc_sim = (m_igpu_sim || m_ptx_sim); \ + m_nvbit_sim = \ + m_simBase->m_knobs->KNOB_CORE_TYPE->getValue() == "nvbit" ? true \ + : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim || m_nvbit_sim); \ m_num_read_port = *KNOB(KNOB_LLC_READ_PORTS); \ m_num_write_port = *KNOB(KNOB_LLC_WRITE_PORTS); \ } \ @@ -116,7 +128,11 @@ POSSIBILITY OF SUCH DAMAGE. m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE->getValue() == "igpu" \ ? true \ : false; \ - m_acc_sim = (m_igpu_sim || m_ptx_sim); \ + m_nvbit_sim = \ + m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE->getValue() == "nvbit" \ + ? true \ + : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim || m_nvbit_sim); \ m_num_read_port = *KNOB(KNOB_L1_READ_PORTS); \ m_num_write_port = *KNOB(KNOB_L1_WRITE_PORTS); \ } else if (level == MEM_L2) { \ @@ -134,7 +150,11 @@ POSSIBILITY OF SUCH DAMAGE. m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE->getValue() == "igpu" \ ? true \ : false; \ - m_acc_sim = (m_igpu_sim || m_ptx_sim); \ + m_nvbit_sim = \ + m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE->getValue() == "nvbit" \ + ? true \ + : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim || m_nvbit_sim); \ m_num_read_port = *KNOB(KNOB_L2_READ_PORTS); \ m_num_write_port = *KNOB(KNOB_L2_WRITE_PORTS); \ } else if (level == MEM_L3) { \ @@ -151,7 +171,11 @@ POSSIBILITY OF SUCH DAMAGE. m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE->getValue() == "igpu" \ ? true \ : false; \ - m_acc_sim = (m_igpu_sim || m_ptx_sim); \ + m_nvbit_sim = \ + m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE->getValue() == "nvbit" \ + ? true \ + : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim || m_nvbit_sim); \ m_num_read_port = *KNOB(KNOB_L3_READ_PORTS); \ m_num_write_port = *KNOB(KNOB_L3_WRITE_PORTS); \ } else if (level == MEM_LLC) { \ @@ -168,7 +192,11 @@ POSSIBILITY OF SUCH DAMAGE. m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE->getValue() == "igpu" \ ? true \ : false; \ - m_acc_sim = (m_igpu_sim || m_ptx_sim); \ + m_nvbit_sim = \ + m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE->getValue() == "nvbit" \ + ? 
true \ + : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim || m_nvbit_sim); \ m_num_read_port = *KNOB(KNOB_LLC_READ_PORTS); \ m_num_write_port = *KNOB(KNOB_LLC_WRITE_PORTS); \ } \ @@ -188,7 +216,11 @@ POSSIBILITY OF SUCH DAMAGE. m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE->getValue() == "igpu" \ ? true \ : false; \ - m_acc_sim = (m_igpu_sim || m_ptx_sim); \ + m_nvbit_sim = \ + m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE->getValue() == "nvbit" \ + ? true \ + : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim || m_nvbit_sim); \ m_num_read_port = *KNOB(KNOB_L1_READ_PORTS); \ m_num_write_port = *KNOB(KNOB_L1_WRITE_PORTS); \ } else if (level == MEM_L2) { \ @@ -206,7 +238,11 @@ POSSIBILITY OF SUCH DAMAGE. m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE->getValue() == "igpu" \ ? true \ : false; \ - m_acc_sim = (m_igpu_sim || m_ptx_sim); \ + m_nvbit_sim = \ + m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE->getValue() == "nvbit" \ + ? true \ + : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim || m_nvbit_sim); \ m_num_read_port = *KNOB(KNOB_L2_READ_PORTS); \ m_num_write_port = *KNOB(KNOB_L2_WRITE_PORTS); \ } else if (level == MEM_L3) { \ @@ -223,7 +259,11 @@ POSSIBILITY OF SUCH DAMAGE. m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE->getValue() == "igpu" \ ? true \ : false; \ - m_acc_sim = (m_igpu_sim || m_ptx_sim); \ + m_nvbit_sim = \ + m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE->getValue() == "nvbit" \ + ? true \ + : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim || m_nvbit_sim); \ m_num_read_port = *KNOB(KNOB_L3_READ_PORTS); \ m_num_write_port = *KNOB(KNOB_L3_WRITE_PORTS); \ } else if (level == MEM_LLC) { \ @@ -240,7 +280,11 @@ POSSIBILITY OF SUCH DAMAGE. m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE->getValue() == "igpu" \ ? true \ : false; \ - m_acc_sim = (m_igpu_sim || m_ptx_sim); \ + m_nvbit_sim = \ + m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE->getValue() == "nvbit" \ + ? true \ + : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim || m_nvbit_sim); \ m_num_read_port = *KNOB(KNOB_LLC_READ_PORTS); \ m_num_write_port = *KNOB(KNOB_LLC_WRITE_PORTS); \ } \ @@ -286,102 +330,130 @@ POSSIBILITY OF SUCH DAMAGE. break; \ } -#define RETIRE_CONFIG() \ - switch (m_unit_type) { \ - case UNIT_SMALL: \ - m_knob_width = *m_simBase->m_knobs->KNOB_WIDTH; \ - m_ptx_sim = \ - static_cast(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "ptx" \ - ? true \ - : false; \ - m_igpu_sim = \ - static_cast(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "igpu" \ - ? true \ - : false; \ - m_acc_sim = (m_igpu_sim || m_ptx_sim); \ - break; \ - case UNIT_MEDIUM: \ - m_knob_width = *m_simBase->m_knobs->KNOB_MEDIUM_WIDTH; \ - m_ptx_sim = static_cast( \ - *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "ptx" \ - ? true \ - : false; \ - m_igpu_sim = static_cast( \ - *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "igpu" \ - ? true \ - : false; \ - m_acc_sim = (m_igpu_sim || m_ptx_sim); \ - break; \ - case UNIT_LARGE: \ - m_knob_width = *m_simBase->m_knobs->KNOB_LARGE_WIDTH; \ - m_ptx_sim = static_cast( \ - *m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == "ptx" \ - ? true \ - : false; \ - m_igpu_sim = static_cast( \ - *m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == "igpu" \ - ? true \ - : false; \ - m_acc_sim = (m_igpu_sim || m_ptx_sim); \ - break; \ +#define RETIRE_CONFIG() \ + switch (m_unit_type) { \ + case UNIT_SMALL: \ + m_knob_width = *m_simBase->m_knobs->KNOB_WIDTH; \ + m_ptx_sim = \ + static_cast(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "ptx" \ + ? true \ + : false; \ + m_igpu_sim = \ + static_cast(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "igpu" \ + ? 
true \ + : false; \ + m_nvbit_sim = \ + static_cast(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "nvbit" \ + ? true \ + : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim || m_nvbit_sim); \ + break; \ + case UNIT_MEDIUM: \ + m_knob_width = *m_simBase->m_knobs->KNOB_MEDIUM_WIDTH; \ + m_ptx_sim = static_cast( \ + *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "ptx" \ + ? true \ + : false; \ + m_igpu_sim = static_cast( \ + *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "igpu" \ + ? true \ + : false; \ + m_nvbit_sim = static_cast( \ + *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "nvbit" \ + ? true \ + : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim || m_nvbit_sim); \ + break; \ + case UNIT_LARGE: \ + m_knob_width = *m_simBase->m_knobs->KNOB_LARGE_WIDTH; \ + m_ptx_sim = static_cast( \ + *m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == "ptx" \ + ? true \ + : false; \ + m_igpu_sim = static_cast( \ + *m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == "igpu" \ + ? true \ + : false; \ + m_nvbit_sim = static_cast( \ + *m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == "nvbit" \ + ? true \ + : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim || m_nvbit_sim); \ + break; \ } -#define EXEC_CONFIG() \ - int int_sched_rate = 0; \ - int mem_sched_rate = 0; \ - int fp_sched_rate = 0; \ - int simd_sched_rate = 0; \ - switch (m_unit_type) { \ - case UNIT_SMALL: \ - int_sched_rate = *m_simBase->m_knobs->KNOB_ISCHED_RATE; \ - mem_sched_rate = *m_simBase->m_knobs->KNOB_MSCHED_RATE; \ - fp_sched_rate = *m_simBase->m_knobs->KNOB_FSCHED_RATE; \ - simd_sched_rate = *m_simBase->m_knobs->KNOB_SSCHED_RATE; \ - m_dcache_cycles = *m_simBase->m_knobs->KNOB_L1_SMALL_LATENCY; \ - m_ptx_sim = \ - static_cast(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "ptx" \ - ? true \ - : false; \ - m_igpu_sim = \ - static_cast(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "igpu" \ - ? true \ - : false; \ - m_acc_sim = (m_igpu_sim || m_ptx_sim); \ - break; \ - \ - case UNIT_MEDIUM: \ - int_sched_rate = *m_simBase->m_knobs->KNOB_ISCHED_MEDIUM_RATE; \ - mem_sched_rate = *m_simBase->m_knobs->KNOB_MSCHED_MEDIUM_RATE; \ - fp_sched_rate = *m_simBase->m_knobs->KNOB_FSCHED_MEDIUM_RATE; \ - simd_sched_rate = *m_simBase->m_knobs->KNOB_SSCHED_MEDIUM_RATE; \ - m_dcache_cycles = *m_simBase->m_knobs->KNOB_L1_MEDIUM_LATENCY; \ - m_ptx_sim = static_cast( \ - *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "ptx" \ - ? true \ - : false; \ - m_acc_sim = (m_igpu_sim || m_ptx_sim); \ - break; \ - \ - case UNIT_LARGE: \ - int_sched_rate = *m_simBase->m_knobs->KNOB_ISCHED_LARGE_RATE; \ - mem_sched_rate = *m_simBase->m_knobs->KNOB_MSCHED_LARGE_RATE; \ - fp_sched_rate = *m_simBase->m_knobs->KNOB_FSCHED_LARGE_RATE; \ - simd_sched_rate = *m_simBase->m_knobs->KNOB_SSCHED_LARGE_RATE; \ - m_dcache_cycles = *m_simBase->m_knobs->KNOB_L1_LARGE_LATENCY; \ - m_ptx_sim = static_cast( \ - *m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == "ptx" \ - ? true \ - : false; \ - m_igpu_sim = static_cast( \ - *m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == "igpu" \ - ? 
true \ - : false; \ - m_acc_sim = (m_igpu_sim || m_ptx_sim); \ - break; \ - } \ - m_max_port[gen_ALLOCQ] = int_sched_rate; \ - m_max_port[mem_ALLOCQ] = mem_sched_rate; \ - m_max_port[fp_ALLOCQ] = fp_sched_rate; \ +#define EXEC_CONFIG() \ + int int_sched_rate = 0; \ + int mem_sched_rate = 0; \ + int fp_sched_rate = 0; \ + int simd_sched_rate = 0; \ + switch (m_unit_type) { \ + case UNIT_SMALL: \ + int_sched_rate = *m_simBase->m_knobs->KNOB_ISCHED_RATE; \ + mem_sched_rate = *m_simBase->m_knobs->KNOB_MSCHED_RATE; \ + fp_sched_rate = *m_simBase->m_knobs->KNOB_FSCHED_RATE; \ + simd_sched_rate = *m_simBase->m_knobs->KNOB_SSCHED_RATE; \ + m_dcache_cycles = *m_simBase->m_knobs->KNOB_L1_SMALL_LATENCY; \ + m_ptx_sim = \ + static_cast(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "ptx" \ + ? true \ + : false; \ + m_igpu_sim = \ + static_cast(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "igpu" \ + ? true \ + : false; \ + m_nvbit_sim = \ + static_cast(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "nvbit" \ + ? true \ + : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim || m_nvbit_sim); \ + break; \ + \ + case UNIT_MEDIUM: \ + int_sched_rate = *m_simBase->m_knobs->KNOB_ISCHED_MEDIUM_RATE; \ + mem_sched_rate = *m_simBase->m_knobs->KNOB_MSCHED_MEDIUM_RATE; \ + fp_sched_rate = *m_simBase->m_knobs->KNOB_FSCHED_MEDIUM_RATE; \ + simd_sched_rate = *m_simBase->m_knobs->KNOB_SSCHED_MEDIUM_RATE; \ + m_dcache_cycles = *m_simBase->m_knobs->KNOB_L1_MEDIUM_LATENCY; \ + m_ptx_sim = static_cast( \ + *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "ptx" \ + ? true \ + : false; \ + m_igpu_sim = static_cast( \ + *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "igpu" \ + ? true \ + : false; \ + m_nvbit_sim = static_cast( \ + *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "nvbit" \ + ? true \ + : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim || m_nvbit_sim); \ + break; \ + \ + case UNIT_LARGE: \ + int_sched_rate = *m_simBase->m_knobs->KNOB_ISCHED_LARGE_RATE; \ + mem_sched_rate = *m_simBase->m_knobs->KNOB_MSCHED_LARGE_RATE; \ + fp_sched_rate = *m_simBase->m_knobs->KNOB_FSCHED_LARGE_RATE; \ + simd_sched_rate = *m_simBase->m_knobs->KNOB_SSCHED_LARGE_RATE; \ + m_dcache_cycles = *m_simBase->m_knobs->KNOB_L1_LARGE_LATENCY; \ + m_ptx_sim = static_cast( \ + *m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == "ptx" \ + ? true \ + : false; \ + m_igpu_sim = static_cast( \ + *m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == "igpu" \ + ? true \ + : false; \ + m_nvbit_sim = static_cast( \ + *m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == "nvbit" \ + ? true \ + : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim || m_nvbit_sim); \ + break; \ + } \ + m_max_port[gen_ALLOCQ] = int_sched_rate; \ + m_max_port[mem_ALLOCQ] = mem_sched_rate; \ + m_max_port[fp_ALLOCQ] = fp_sched_rate; \ m_max_port[simd_ALLOCQ] = simd_sched_rate; #define SCHED_CONFIG() \ @@ -463,12 +535,18 @@ POSSIBILITY OF SUCH DAMAGE. m_ptx_sim = false; \ m_fetch_ratio = *m_simBase->m_knobs->KNOB_CPU_FETCH_RATIO; \ } \ + m_igpu_sim = \ + static_cast(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "igpu" \ + ? true \ + : false; \ + m_nvbit_sim = \ + static_cast(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "nvbit" \ + ? true \ + : false; \ + if (m_nvbit_sim) \ + m_fetch_ratio = *m_simBase->m_knobs->KNOB_GPU_FETCH_RATIO; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim || m_nvbit_sim); \ break; \ - m_igpu_sim = static_cast( \ - *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "igpu" \ - ? 
true \ - : false; \ - m_acc_sim = (m_igpu_sim || m_ptx_sim); \ case UNIT_MEDIUM: \ m_knob_width = *m_simBase->m_knobs->KNOB_MEDIUM_WIDTH; \ m_knob_fetch_width = *m_simBase->m_knobs->KNOB_FETCH_MEDIUM_WDITH; \ @@ -483,13 +561,19 @@ POSSIBILITY OF SUCH DAMAGE. m_ptx_sim = false; \ m_fetch_ratio = *m_simBase->m_knobs->KNOB_CPU_FETCH_RATIO; \ } \ - break; \ m_igpu_sim = static_cast( \ *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "igpu" \ ? true \ : false; \ - m_acc_sim = (m_igpu_sim || m_ptx_sim); \ + m_nvbit_sim = static_cast( \ + *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "nvbit" \ + ? true \ + : false; \ + if (m_nvbit_sim) \ + m_fetch_ratio = *m_simBase->m_knobs->KNOB_GPU_FETCH_RATIO; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim || m_nvbit_sim); \ \ + break; \ case UNIT_LARGE: \ m_knob_width = *m_simBase->m_knobs->KNOB_LARGE_WIDTH; \ m_knob_fetch_width = *m_simBase->m_knobs->KNOB_FETCH_LARGE_WDITH; \ @@ -504,12 +588,18 @@ POSSIBILITY OF SUCH DAMAGE. m_ptx_sim = false; \ m_fetch_ratio = *m_simBase->m_knobs->KNOB_CPU_FETCH_RATIO; \ } \ - break; \ m_igpu_sim = static_cast( \ *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "igpu" \ ? true \ : false; \ - m_acc_sim = (m_igpu_sim || m_ptx_sim); \ + m_nvbit_sim = static_cast( \ + *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "nvbit" \ + ? true \ + : false; \ + if (m_nvbit_sim) \ + m_fetch_ratio = *m_simBase->m_knobs->KNOB_GPU_FETCH_RATIO; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim || m_nvbit_sim); \ + break; \ } #define CORE_CONFIG() \ diff --git a/src/core.cc b/src/core.cc index 9b0d71fa..36c6f391 100644 --- a/src/core.cc +++ b/src/core.cc @@ -168,7 +168,7 @@ core_c::core_c(int c_id, macsim_c* simBase, Unit_Type type) { m_icache->set_core_id(m_core_id); // reorder buffer - if (m_core_type == "ptx" || m_core_type == "igpu") { + if (m_core_type == "ptx" || m_core_type == "igpu" || m_core_type == "nvbit") { m_rob = NULL; m_gpu_rob = new smc_rob_c(m_unit_type, m_core_id, m_simBase); } else { @@ -182,7 +182,7 @@ core_c::core_c(int c_id, macsim_c* simBase, Unit_Type type) { (m_knob_fetch_latency + m_knob_alloc_latency), "q_frontend", m_simBase); // allocation queue - if (m_core_type == "ptx" || m_core_type == "igpu") { + if (m_core_type == "ptx" || m_core_type == "igpu" || m_core_type == "nvbit") { m_q_iaq = NULL; m_gpu_q_iaq = new pqueue_c*[max_ALLOCQ]; } else { @@ -197,7 +197,7 @@ core_c::core_c(int c_id, macsim_c* simBase, Unit_Type type) { q_iaq_size[simd_ALLOCQ] = siaq_size; sstr.clear(); - if (m_core_type == "ptx" || m_core_type == "igpu") { + if (m_core_type == "ptx" || m_core_type == "igpu" || m_core_type == "nvbit") { for (int i = 0; i < max_ALLOCQ; ++i) { sstr << "q_iaq" << i; sstr >> name; @@ -227,7 +227,7 @@ core_c::core_c(int c_id, macsim_c* simBase, Unit_Type type) { FRONTEND_INTERFACE_ARGS(), m_simBase); // allocation stage - if (m_core_type == "ptx" || m_core_type == "igpu") { + if (m_core_type == "ptx" || m_core_type == "igpu" || m_core_type == "nvbit") { m_allocate = NULL; m_gpu_allocate = new smc_allocate_c(m_core_id, m_q_frontend, m_gpu_q_iaq, m_uop_pool, m_gpu_rob, m_unit_type, @@ -243,7 +243,7 @@ core_c::core_c(int c_id, macsim_c* simBase, Unit_Type type) { m_exec = new exec_c(EXEC_INTERFACE_ARGS(), m_simBase); // instruction scheduler - if (m_core_type == "ptx") { + if (m_core_type == "ptx" || m_core_type == "nvbit") { m_schedule = new schedule_smc_c(m_core_id, m_gpu_q_iaq, m_gpu_rob, m_exec, m_unit_type, m_frontend, m_simBase); } else if (m_core_type == "igpu") { @@ -271,7 +271,7 @@ core_c::core_c(int c_id, macsim_c* simBase, 
Unit_Type type) { m_hw_pref = new hwp_common_c(c_id, type, m_simBase); // const / texture cache - if (m_core_type == "ptx" && + if ((m_core_type == "ptx" || m_core_type == "nvbit") && *m_simBase->m_knobs->KNOB_USE_CONST_AND_TEX_CACHES) { m_const_cache = new readonly_cache_c( "const_cache", m_core_id, *KNOB(KNOB_CONST_CACHE_SIZE), @@ -290,7 +290,7 @@ core_c::core_c(int c_id, macsim_c* simBase, Unit_Type type) { } // shared memory - if (m_core_type == "ptx") { + if ((m_core_type == "ptx") || (m_core_type == "nvbit")) { m_shared_memory = new sw_managed_cache_c( "shared_memory", m_core_id, *KNOB(KNOB_SHARED_MEM_SIZE), *KNOB(KNOB_SHARED_MEM_ASSOC), *KNOB(KNOB_SHARED_MEM_LINE_SIZE), @@ -311,7 +311,7 @@ core_c::~core_c() { delete m_q_frontend; delete m_frontend; delete m_uop_pool; - if (m_core_type == "ptx" || m_core_type == "igpu") { + if (m_core_type == "ptx" || m_core_type == "igpu" || m_core_type == "nvbit") { delete m_gpu_rob; delete m_gpu_allocate; for (int i = 0; i < max_ALLOCQ; ++i) { @@ -377,7 +377,7 @@ void core_c::run_a_cycle(bool pll_lock) { // to simulate kernel invocation from host code if (*KNOB(KNOB_ENABLE_CONDITIONAL_EXECUTION)) { - if (m_core_type == "ptx" && m_simBase->m_gpu_paused) { + if ((m_core_type == "ptx" || m_core_type == "nvbit") && m_simBase->m_gpu_paused) { m_frontend->stop(); } } @@ -415,7 +415,7 @@ void core_c::advance_queues(void) { m_q_frontend->advance(); // advance allocation queue - if (m_core_type == "ptx" || m_core_type == "igpu") { + if (m_core_type == "ptx" || m_core_type == "igpu" || m_core_type == "nvbit") { for (int i = 0; i < max_ALLOCQ; ++i) { m_gpu_q_iaq[i]->advance(); } @@ -647,8 +647,8 @@ void core_c::allocate_thread_data(int tid) { m_retire->allocate_retire_data(tid); // allocate scheduler queue and rob for GPU simulation - if (m_core_type == "ptx" || m_core_type == "igpu") - m_gpu_rob->reserve_rob(tid); + if (m_core_type == "ptx" || m_core_type == "igpu" || m_core_type == "nvbit") + m_gpu_rob->reserve_rob(tid); } // When a thread is terminated, deallocate all data used by this thread @@ -680,7 +680,7 @@ void core_c::deallocate_thread_data(int tid) { m_last_terminated_tid = ++t_id; } - if (m_core_type == "ptx" || m_core_type == "igpu") m_gpu_rob->free_rob(tid); + if (m_core_type == "ptx" || m_core_type == "igpu" || m_core_type == "nvbit") m_gpu_rob->free_rob(tid); // check forward progress if (m_unique_scheduled_thread_num >= m_last_terminated_tid + 1000) { @@ -706,7 +706,7 @@ void core_c::train_hw_pref(int level, int tid, Addr addr, Addr pc, uop_c* uop, // hardware prefetcher initialization void core_c::pref_init(void) { if (*m_simBase->m_knobs->KNOB_PREF_FRAMEWORK_ON && m_knob_enable_pref) { - m_hw_pref->pref_init(m_core_type == "ptx" ? true : false); + m_hw_pref->pref_init((m_core_type == "ptx") || (m_core_type == "nvbit") ? 
true : false); } } diff --git a/src/exec.cc b/src/exec.cc index d9ba5932..43d3bd3c 100644 --- a/src/exec.cc +++ b/src/exec.cc @@ -123,6 +123,11 @@ static Uop_LatencyBinding_Init uop_latencybinding_init_igpu[] = { #include "../def/uoplatency_igpu.def" }; +static Uop_LatencyBinding_Init uop_latencybinding_init_nvbit[] = { +#define DEFUOP(A, B) {A, #A, B}, +#include "../def/uoplatency_nvbit580.def" +}; + // exec_c constructor exec_c::exec_c(EXEC_INTERFACE_PARAMS(), macsim_c* simBase) : EXEC_INTERFACE_INIT() { @@ -149,6 +154,14 @@ exec_c::exec_c(EXEC_INTERFACE_PARAMS(), macsim_c* simBase) m_latency[uop_latencybinding_init_igpu[ii].uop_type_s] = uop_latencybinding_init_igpu[ii].m_latency; } + } else if (m_nvbit_sim) { + int latency_array_size = (sizeof uop_latencybinding_init_nvbit / + sizeof(uop_latencybinding_init_nvbit[0])); + + for (int ii = 0; ii < latency_array_size; ++ii) { + m_latency[uop_latencybinding_init_nvbit[ii].uop_type_s] = + uop_latencybinding_init_nvbit[ii].m_latency; + } } else { latency_map lat_map = m_simBase->m_knobsContainer->getDecodedUOPLatencyKnob(); @@ -434,9 +447,8 @@ bool exec_c::exec(int thread_id, int entry, uop_c* uop) { // other (global, texture, local) memory access else { #if PORT_FIXME - if (0 && - m_bank_busy[uop->m_child_uops[next_set_bit] - ->m_dcache_bank_id] == true) { + if (0 && m_bank_busy[uop->m_child_uops[next_set_bit] + ->m_dcache_bank_id] == true) { STAT_EVENT(CACHE_BANK_BUSY); uop_latency = 0; } else { diff --git a/src/exec.h b/src/exec.h index ab80e142..bda237a0 100644 --- a/src/exec.h +++ b/src/exec.h @@ -187,6 +187,7 @@ class exec_c bool m_acc_sim; /**< gpu simulation */ bool m_igpu_sim; /**< intel gpu simulation */ bool m_ptx_sim; /**< PTX simulation */ + bool m_nvbit_sim; /**< NVBIT simulation */ int m_latency[NUM_UOP_TYPES]; /**< latency map */ Counter m_cur_core_cycle; /**< current core cycle */ int m_max_port[max_ALLOCQ]; /**< maximum port */ diff --git a/src/frontend.cc b/src/frontend.cc index fd41e17d..64c85ed9 100644 --- a/src/frontend.cc +++ b/src/frontend.cc @@ -302,7 +302,7 @@ void frontend_c::run_a_cycle(void) { // TONAGESH // nagesh - comments for BAR are incomplete... - if (m_ptx_sim) { + if (m_ptx_sim || m_nvbit_sim) { // handling of BAR instruction in PTX - can/should this be moved? // do we have any blocks for which all warps have reached (retired) // their next barrier? @@ -348,7 +348,7 @@ FRONTEND_MODE frontend_c::process_ifetch(unsigned int tid, // First time : set up traces for current thread if (fetch_data->m_first_time) { - m_simBase->m_trace_reader->setup_trace(m_core_id, tid, m_ptx_sim); + m_simBase->m_trace_reader->setup_trace(m_core_id, tid, m_ptx_sim || m_nvbit_sim); fetch_data->m_first_time = false; ++m_core->m_inst_fetched[tid]; /*! 
initial increase */ @@ -369,6 +369,11 @@ FRONTEND_MODE frontend_c::process_ifetch(unsigned int tid, static_cast(thread->m_prev_trace_info); fetch_data->m_MT_scheduler.m_next_fetch_addr = prev_trace_info->m_instruction_addr; + } else if (m_nvbit_sim) { + trace_info_nvbit_s *prev_trace_info = + static_cast(thread->m_prev_trace_info); + fetch_data->m_MT_scheduler.m_next_fetch_addr = + prev_trace_info->m_inst_addr; } } else { if (KNOB(KNOB_LARGE_CORE_TYPE)->getValue() == "x86") { @@ -462,7 +467,7 @@ FRONTEND_MODE frontend_c::process_ifetch(unsigned int tid, // read an uop from the traces if (!m_simBase->m_trace_reader->get_uops_from_traces(m_core_id, new_uop, - tid, m_ptx_sim)) { + tid, m_ptx_sim || m_nvbit_sim)) { // couldn't get an uop DEBUG_CORE(m_core_id, "not success\n"); m_uop_pool->release_entry(new_uop->free()); @@ -635,7 +640,7 @@ bool frontend_c::access_icache(int tid, Addr fetch_addr, int result = m_simBase->m_memory->new_mem_req( MRT_IFETCH, line_addr, m_knob_icache_line_size, false, false, 0, NULL, icache_fill_line_wrapper, m_core->get_unique_uop_num(), NULL, m_core_id, - tid, m_ptx_sim); + tid, m_ptx_sim||m_nvbit_sim); // mshr full if (!result) return false; @@ -810,7 +815,7 @@ int frontend_c::predict_bpu(uop_c *uop) { // no branch prediction else { // GPU : stall on branch policy, stop fetching - if (m_ptx_sim && *m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR) { + if ((m_ptx_sim || m_nvbit_sim) && *m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR) { set_br_wait(uop->m_thread_id); mispredicted = false; } @@ -910,7 +915,7 @@ int frontend_c::fetch_rr(void) { } // check the thread is ready to fetch - if (m_ptx_sim) { + if (m_ptx_sim || m_nvbit_sim) { // GPU : stall on branch policy, check whether previous branch has been resolved if (*m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR && !check_br_ready(fetch_id)) { diff --git a/src/frontend.h b/src/frontend.h index d193731f..bca26f6b 100644 --- a/src/frontend.h +++ b/src/frontend.h @@ -409,6 +409,7 @@ class frontend_c bool m_fe_running; /**< enabled frontend */ bool m_ptx_sim; /**< PTX simulation */ bool m_igpu_sim; /**< iGPU simulation */ + bool m_nvbit_sim; /**< NVBIT simulation */ bool m_acc_sim; /**< Accelerator simulation */ bool m_ready_thread_available; /**< ready thread available */ bool m_last_fetch_tid_failed; diff --git a/src/global_defs.h b/src/global_defs.h index ddb16ece..1dfbeceb 100644 --- a/src/global_defs.h +++ b/src/global_defs.h @@ -159,7 +159,7 @@ void delete_store_hash_entry_wrapper(map_c *map, uop_c *uop); /////////////////////////////////////////////////////////////////////////////////////////////// // Global definitions -#define MAX_TR_OPCODE_NAME GPU_OPCODE_LAST +#define MAX_TR_OPCODE_NAME NVBIT_OPCODE_LAST #define MAX_GPU_ADDR_SPACE GPU_ADDR_SP_LAST #define MAX_GPU_CACHE_OP GPU_CACHE_OP_LAST #define MAX_GPU_CACHE_LEVEL GPU_CACHE_LAST diff --git a/src/macsim.cc b/src/macsim.cc index 92d5e414..dbf5b7b9 100644 --- a/src/macsim.cc +++ b/src/macsim.cc @@ -335,7 +335,8 @@ void macsim_c::init_cores(int num_max_core) { m_core_pointers[ii]->pref_init(); // insert to the core type pool - if (static_cast(*m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == "ptx") + if ((static_cast(*m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == "ptx") || + (static_cast(*m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == "nvbit")) m_acc_core_pool.push(ii); else m_x86_core_pool.push(ii); @@ -350,8 +351,10 @@ void macsim_c::init_cores(int num_max_core) { m_core_pointers[ii + num_large_cores]->pref_init(); // insert to the core type pool - if 
(static_cast(*m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == - "ptx") + if ((static_cast(*m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == + "ptx") || + (static_cast(*m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == + "nvbit")) m_acc_core_pool.push(ii + total_core); else m_x86_core_pool.push(ii + total_core); @@ -366,7 +369,8 @@ void macsim_c::init_cores(int num_max_core) { m_core_pointers[ii + num_large_medium_cores]->pref_init(); // insert to the core type pool - if (static_cast(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "ptx") + if ((static_cast(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "ptx") || + (static_cast(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "nvbit")) m_acc_core_pool.push(ii + total_core); else m_x86_core_pool.push(ii + total_core); @@ -814,7 +818,7 @@ void macsim_c::init_clock_domain(void) { for (int ii = 0; ii < m_num_sim_cores; ++ii) { core_c* core = m_core_pointers[ii]; string core_type = core->get_core_type(); - if (core_type == "ptx") { + if (core_type == "ptx" || core_type == "nvbit") { m_domain_freq[ii] = static_cast(domain_f[CLOCK_GPU]); } else { m_domain_freq[ii] = static_cast(domain_f[CLOCK_CPU]); diff --git a/src/main.cc b/src/main.cc index bb7f253a..ac1910a4 100644 --- a/src/main.cc +++ b/src/main.cc @@ -27,7 +27,7 @@ POSSIBILITY OF SUCH DAMAGE. */ /********************************************************************************************** - * File : maim.cc + * File : main.cc * Author : HPArch * Date : 3/25/2011 * SVN : $Id: main.cc 911 2009-11-20 19:08:10Z kacear $: diff --git a/src/memory.cc b/src/memory.cc index a7e609f3..63272ddf 100644 --- a/src/memory.cc +++ b/src/memory.cc @@ -556,10 +556,13 @@ int dcu_c::access(uop_c* uop) { case MEM_LD_LM: case MEM_LD_CM: case MEM_LD_TM: + case MEM_LD_GM: + case MEM_LD_SM: req_type = MRT_DFETCH; break; case MEM_ST: case MEM_ST_LM: + case MEM_ST_SM: case MEM_ST_GM: req_type = MRT_DSTORE; break; @@ -1576,18 +1579,21 @@ memory_c::memory_c(macsim_c* simBase) { m_num_cpu = 0; if ((KNOB(KNOB_LARGE_CORE_TYPE)->getValue() == "ptx") || + (KNOB(KNOB_LARGE_CORE_TYPE)->getValue() == "nvbit") || (KNOB(KNOB_LARGE_CORE_TYPE)->getValue() == "igpu")) m_num_gpu += *KNOB(KNOB_NUM_SIM_LARGE_CORES); else m_num_cpu += *KNOB(KNOB_NUM_SIM_LARGE_CORES); if ((KNOB(KNOB_MEDIUM_CORE_TYPE)->getValue() == "ptx") || + (KNOB(KNOB_MEDIUM_CORE_TYPE)->getValue() == "nvbit") || (KNOB(KNOB_LARGE_CORE_TYPE)->getValue() == "igpu")) m_num_gpu += *KNOB(KNOB_NUM_SIM_MEDIUM_CORES); else m_num_cpu += *KNOB(KNOB_NUM_SIM_MEDIUM_CORES); if ((KNOB(KNOB_CORE_TYPE)->getValue() == "ptx") || + (KNOB(KNOB_CORE_TYPE)->getValue() == "nvbit") || (KNOB(KNOB_LARGE_CORE_TYPE)->getValue() == "igpu")) m_num_gpu += *KNOB(KNOB_NUM_SIM_SMALL_CORES); else diff --git a/src/memory.h b/src/memory.h index 3efa846f..285b9d8f 100644 --- a/src/memory.h +++ b/src/memory.h @@ -306,6 +306,7 @@ class dcu_c bool m_acc_sim; /**< gpu cache */ bool m_igpu_sim; /**< intel gpu cache */ bool m_ptx_sim; /**< gpu cache */ + bool m_nvbit_sim; /**< NVBIT simulation */ queue_c* m_in_queue; /**< input queue */ queue_c* m_wb_queue; /**< write-back queue */ queue_c* m_fill_queue; /**< fill queue */ diff --git a/src/process_manager.cc b/src/process_manager.cc index 1456f5f8..594c9a6e 100644 --- a/src/process_manager.cc +++ b/src/process_manager.cc @@ -105,15 +105,6 @@ thread_stat_s::thread_stat_s() { m_thread_end_cycle = 0; } -//////////////////////////////////////////////////////////////////////////////// -// process_manager_c() - constructor -// m_thread_queue - contains the list of unassigned threads (from all -// 
applications) that are ready to be launched -// m_block_queue - contains the list of unassigned blocks (from all -// applications) that are ready to be launched -//////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////////// // process_s() - constructor //////////////////////////////////////////////////////////////////////////////// @@ -137,7 +128,7 @@ process_s::process_s() { } //////////////////////////////////////////////////////////////////////////////// -// process_s() - constructor +// process_s() - destructor //////////////////////////////////////////////////////////////////////////////// process_s::~process_s() { } @@ -148,6 +139,7 @@ process_s::~process_s() { thread_s::thread_s(macsim_c *simBase) { m_simBase = simBase; m_fetch_data = new frontend_s; + int buf_ele_size = (CPU_TRACE_SIZE > GPU_TRACE_SIZE) ? CPU_TRACE_SIZE : GPU_TRACE_SIZE; m_buffer = new char[1000 * buf_ele_size]; @@ -185,9 +177,6 @@ thread_s::~thread_s() { //////////////////////////////////////////////////////////////////////////////// // block_schedule_info_s() - constructor //////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////// - -// block_schedule_info_s constructor block_schedule_info_s::block_schedule_info_s() { m_start_to_fetch = false; m_dispatched_core_id = -1; @@ -199,11 +188,12 @@ block_schedule_info_s::block_schedule_info_s() { m_total_thread_num = 0; } +//////////////////////////////////////////////////////////////////////////////// +// block_schedule_info_s() - destructor +//////////////////////////////////////////////////////////////////////////////// block_schedule_info_s::~block_schedule_info_s() { } -/////////////////////////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////////// // process_manager_c() - constructor // m_thread_queue - contains the list of unassigned threads (from all @@ -240,7 +230,7 @@ process_manager_c::~process_manager_c() { // process_manager_c::create_thread_node() // called for each thread/warp when it becomes ready to be launched (started); // allocates a node for the thread/warp and add its to m_thread_queue (for x86) -// or m_block_queue (for ptx) +// or m_block_queue (for ptx or nvbit) //////////////////////////////////////////////////////////////////////////////// void process_manager_c::create_thread_node(process_s *process, int tid, bool main) { @@ -263,6 +253,8 @@ void process_manager_c::create_thread_node(process_s *process, int tid, // block id assignment in case of multiple applications int block_id = start_info->m_thread_id >> BLOCK_ID_SHIFT; + // inside parenthesis is the unique block id + // using multi-key map with multi_key_map_c::find(key1, key2) node->m_block_id = m_simBase->m_block_id_mapper->find( process->m_process_id, block_id + @@ -406,6 +398,7 @@ int process_manager_c::create_process(string appl, int repeat, int pid) { // use for reading traces (which contains one extra field compared to the // structure used when generating traces) match with the structure used when // generating traces + if (*KNOB(KNOB_TRACE_USES_64_BIT_ADDR)) { assert(sizeof(trace_info_gpu_s) == (sizeof(trace_info_gpu_small_s) + sizeof(uint64_t))); @@ -413,6 +406,9 @@ 
int process_manager_c::create_process(string appl, int repeat, int pid) { assert(sizeof(trace_info_gpu_s) == (sizeof(trace_info_gpu_small_s) + sizeof(uint32_t))); } + } else if (trace_type == "nvbit") { + process->m_acc = true; + process->m_core_pool = &m_simBase->m_acc_core_pool; } else { process->m_acc = false; process->m_core_pool = &m_simBase->m_x86_core_pool; @@ -502,27 +498,31 @@ void process_manager_c::setup_process(process_s *process) { "GPU traces\n"); } } + printf("trace type: "); + cout << trace_type; + cout << " end" << endl; // get occupancy if (trace_type == "ptx") { process->m_max_block = *m_simBase->m_knobs->KNOB_MAX_BLOCK_PER_CORE; - } - if (trace_type == "newptx") { + } else if (trace_type == "newptx") { if (!(trace_config_file >> process->m_max_block)) ASSERTM(0, "error reading from file:%s", trace_info_file_name.c_str()); trace_type = "ptx"; if (*m_simBase->m_knobs->KNOB_MAX_BLOCK_PER_CORE_SUPER > 0) { process->m_max_block = *m_simBase->m_knobs->KNOB_MAX_BLOCK_PER_CORE_SUPER; } - } - - if (trace_type == "x86") { + } else if (trace_type == "x86") { std::string gen_version; trace_config_file >> gen_version; if (gen_version != t_gen_ver) std::cout << "!!WARNING!! Trace reader and trace generator version " "mismatch; trace may not be read correctly." << std::endl; + } else if (trace_type == "nvbit") { + if (!(trace_config_file >> process->m_max_block)) + ASSERTM(0, "error reading from file:%s", trace_info_file_name.c_str()); + process->m_max_block = *m_simBase->m_knobs->KNOB_MAX_BLOCK_PER_CORE; } // get thread count @@ -538,7 +538,8 @@ void process_manager_c::setup_process(process_s *process) { thread_count = *KNOB(KNOB_TRACE_MAX_THREAD_COUNT); } - report("thread_count:" << thread_count); + report("thread_count: " << thread_count); + report("max blocks per core: " << process->m_max_block); // create data structures thread_stat_s *new_stat = new thread_stat_s[thread_count]; @@ -638,7 +639,8 @@ void process_manager_c::setup_process(process_s *process) { // TODO (jaekyu, 1-30-2009) // FIXME - if (trace_type == "ptx" && *KNOB(KNOB_BLOCKS_TO_SIMULATE)) { + // euijun + if ((trace_type == "ptx" || trace_type == "nvbit") && *KNOB(KNOB_BLOCKS_TO_SIMULATE)) { if ((*KNOB(KNOB_BLOCKS_TO_SIMULATE) * m_simBase->m_no_threads_per_block) < static_cast(thread_count)) { uns temp = thread_count; @@ -766,8 +768,13 @@ thread_s *process_manager_c::create_thread(process_s *process, int tid, // TODO - nbl (apr-17-2013): use pools if (process->m_acc) { - trace_info->m_prev_trace_info = new trace_info_gpu_s; - trace_info->m_next_trace_info = new trace_info_gpu_s; + if (KNOB(KNOB_CORE_TYPE)->getValue() == "nvbit") { + trace_info->m_prev_trace_info = new trace_info_nvbit_s; + trace_info->m_next_trace_info = new trace_info_nvbit_s; + } else if (KNOB(KNOB_CORE_TYPE)->getValue() == "ptx") { + trace_info->m_prev_trace_info = new trace_info_gpu_s; + trace_info->m_next_trace_info = new trace_info_gpu_s; + } } else { if (KNOB(KNOB_LARGE_CORE_TYPE)->getValue() == "x86") { trace_info->m_prev_trace_info = new trace_info_cpu_s; @@ -810,9 +817,15 @@ thread_s *process_manager_c::create_thread(process_s *process, int tid, #ifndef USING_QSIM // open trace file - trace_info->m_trace_file = gzopen(filename.c_str(), "r"); - if (trace_info->m_trace_file == NULL) - ASSERTM(0, "error opening trace file:%s\n", filename.c_str()); + trace_info->m_trace_file = gzopen(filename.c_str(), "rb"); + + if (trace_info->m_trace_file == NULL) { + int errnum = errno; + const char* errmsg = strerror(errnum); + printf("Error opening file 
%s: %s\n", filename.c_str(), errmsg); + ASSERTM(0, "error opening trace file: %s\n", filename.c_str()); + } + #endif trace_info->m_file_opened = true; @@ -973,11 +986,19 @@ int process_manager_c::terminate_thread(int core_id, thread_s *trace_info, // TODO - nbl (apr-17-2013): use pools if (trace_info->m_process->m_acc) { - trace_info_gpu_s *temp = - static_cast(trace_info->m_prev_trace_info); - delete temp; - temp = static_cast(trace_info->m_next_trace_info); - delete temp; + if (KNOB(KNOB_CORE_TYPE)->getValue() == "ptx") { + trace_info_gpu_s *temp = + static_cast(trace_info->m_prev_trace_info); + delete temp; + temp = static_cast(trace_info->m_next_trace_info); + delete temp; + } else if (KNOB(KNOB_CORE_TYPE)->getValue() == "nvbit") { + trace_info_nvbit_s *temp = + static_cast(trace_info->m_prev_trace_info); + delete temp; + temp = static_cast(trace_info->m_next_trace_info); + delete temp; + } } else { if (KNOB(KNOB_LARGE_CORE_TYPE)->getValue() == "x86") { trace_info_cpu_s *temp = @@ -1086,6 +1107,7 @@ int process_manager_c::get_next_low_occupancy_core(std::string core_type) { core_c *core = m_simBase->m_core_pointers[core_id]; if (*KNOB(KNOB_ROUTER_PLACEMENT) == 1 && core->get_core_type() != "ptx" && + core->get_core_type() != "nvbit" && (core_id < *KNOB(KNOB_CORE_ENABLE_BEGIN) || *KNOB(KNOB_CORE_ENABLE_END) < core_id)) continue; @@ -1106,6 +1128,7 @@ int process_manager_c::get_next_available_core(std::string core_type) { core_c *core = m_simBase->m_core_pointers[core_id]; if (*KNOB(KNOB_ROUTER_PLACEMENT) == 1 && core->get_core_type() != "ptx" && + core->get_core_type() != "nvbit" && (core_id < *KNOB(KNOB_CORE_ENABLE_BEGIN) || *KNOB(KNOB_CORE_ENABLE_END) < core_id)) continue; @@ -1155,7 +1178,7 @@ void process_manager_c::sim_thread_schedule(bool initial) { for (std::set::const_iterator itr = core_type_set.begin(); itr != core_type_set.end(); itr++) { std::string core_type = *itr; - if (core_type == "ptx") continue; + if (core_type == "ptx" || core_type == "nvbit") continue; // Get a core of this type // Follow the knob policy (greedy or balanced) @@ -1218,7 +1241,7 @@ void process_manager_c::sim_thread_schedule(bool initial) { core_c *core = m_simBase->m_core_pointers[core_id]; std::string core_type = core->get_core_type(); - if (core_type != "ptx") continue; + if (core_type != "ptx" && core_type != "nvbit") continue; // get currently fetching id int prev_fetching_block_id = core->m_fetching_block_id; @@ -1335,7 +1358,7 @@ int process_manager_c::sim_schedule_thread_block(int core_id, bool initial) { } } - // All threads from previous block have been schedule. Thus, need to find a new block + // All threads from previous block have been scheduled. 
Thus, need to find a new block int appl_id = core->get_appl_id(); int max_block_per_core = m_simBase->m_sim_processes[appl_id]->m_max_block; @@ -1362,8 +1385,7 @@ int process_manager_c::sim_schedule_thread_block(int core_id, bool initial) { m_simBase->m_block_id_mapper->find( process->m_process_id, process->m_kernel_block_start_count [process->m_current_vector_index - 1])) % - num_core_per_appl != - (core_id - min_core_id)) { + num_core_per_appl != (core_id - min_core_id)) { continue; } } diff --git a/src/retire.cc b/src/retire.cc index 1a80efb7..6ec15bb2 100644 --- a/src/retire.cc +++ b/src/retire.cc @@ -108,7 +108,7 @@ retire_c::retire_c(RETIRE_INTERFACE_PARAMS(), macsim_c* simBase) RETIRE_CONFIG(); - if (m_ptx_sim || m_igpu_sim) m_knob_width = 1000; + if (m_ptx_sim || m_igpu_sim || m_nvbit_sim) m_knob_width = 1000; } // retire_c destructor @@ -130,7 +130,7 @@ void retire_c::run_a_cycle() { vector* uop_list = NULL; unsigned int uop_list_index = 0; - if (m_ptx_sim || m_igpu_sim) { + if (m_ptx_sim || m_igpu_sim || m_nvbit_sim) { // GPU : many retireable uops from multiple threads. Get entire retireable uops uop_list = m_gpu_rob->get_n_uops_in_ready_order(m_knob_width, m_cur_core_cycle); @@ -144,7 +144,7 @@ void retire_c::run_a_cycle() { // we need to handle retirement for x86 and ptx separately // retirement logic for GPU - if (m_ptx_sim || m_igpu_sim) { + if (m_ptx_sim || m_igpu_sim || m_nvbit_sim) { // GPU : many retireable uops from multiple threads. Get entire retireable uops if (uop_list_index == uop_list->size()) { uop_list->clear(); @@ -281,7 +281,7 @@ void retire_c::run_a_cycle() { STAT_EVENT(UOP_COUNT_TOT); // GPU : barrier - if (m_ptx_sim && cur_uop->m_bar_type == BAR_FETCH) { + if ((m_ptx_sim || m_nvbit_sim) && cur_uop->m_bar_type == BAR_FETCH) { frontend_c* frontend = core->get_frontend(); frontend->synch_thread(cur_uop->m_block_id, cur_uop->m_thread_id); } @@ -542,7 +542,7 @@ void retire_c::update_stats(process_s* process) { // TOCHECK I will get back to this later if (*KNOB(KNOB_REPEAT_TRACE) && process->m_repeat < *KNOB(KNOB_REPEAT_TRACE_N) && - core->get_core_type() == "ptx") { + (core->get_core_type() == "ptx" || core->get_core_type() == "nvbit")) { if ((process->m_repeat + 1) == *m_simBase->m_knobs->KNOB_REPEAT_TRACE_N) { --m_simBase->m_process_count_without_repeat; STAT_EVENT_N(CYC_COUNT_ACC, CYCLE); @@ -554,7 +554,7 @@ void retire_c::update_stats(process_s* process) { } } else { if (process->m_repeat == 0) { - if (core->get_core_type() == "ptx") { + if ((core->get_core_type() == "ptx") || (core->get_core_type() == "nvbit")) { STAT_EVENT_N(CYC_COUNT_ACC, CYCLE); } else { STAT_EVENT_N(CYC_COUNT_X86, CYCLE); diff --git a/src/retire.h b/src/retire.h index 6bb64a8b..78d25b73 100644 --- a/src/retire.h +++ b/src/retire.h @@ -221,6 +221,7 @@ class retire_c bool m_ptx_sim; /**< ptx simulation */ bool m_acc_sim; /**< accelerator simulation */ bool m_igpu_sim; /**< intel gpu simulation */ + bool m_nvbit_sim; /**< NVBIT simulation */ unordered_map m_insts_retired; /**< number of retired inst. 
per thread */ unordered_map diff --git a/src/rwqueue b/src/rwqueue index 0823c612..2dee33ae 160000 --- a/src/rwqueue +++ b/src/rwqueue @@ -1 +1 @@ -Subproject commit 0823c6127331d16ac33679f926be6f9e9f372d98 +Subproject commit 2dee33ae3edd1e454ac34fea0a27017613355eff diff --git a/src/schedule_smc.cc b/src/schedule_smc.cc index 664d5e50..b5747641 100644 --- a/src/schedule_smc.cc +++ b/src/schedule_smc.cc @@ -305,7 +305,7 @@ bool schedule_smc_c::uop_schedule_smc(int thread_id, int entry, // check available mshr spaces for scheduling core_c *core = m_simBase->m_core_pointers[m_core_id]; - if ("ptx" == core->get_core_type() && cur_uop->m_mem_type != NOT_MEM && + if ((("ptx" == core->get_core_type()) || (core->get_core_type() == "nvbit") )&& cur_uop->m_mem_type != NOT_MEM && cur_uop->m_num_child_uops > 0) { // constant or texture memory access if (cur_uop->m_mem_type == MEM_LD_CM || diff --git a/src/trace_read.cc b/src/trace_read.cc index 2f10b11f..61eb5a25 100644 --- a/src/trace_read.cc +++ b/src/trace_read.cc @@ -45,7 +45,7 @@ POSSIBILITY OF SUCH DAMAGE. #include "knob.h" #include "process_manager.h" #include "debug_macros.h" -#include "statistics.h" +// #include "statistics.h" #include "frontend.h" #include "statsEnums.h" #include "utils.h" @@ -59,6 +59,7 @@ POSSIBILITY OF SUCH DAMAGE. #include "trace_read_a64.h" #include "trace_read_igpu.h" #include "trace_read_gpu.h" +#include "trace_read_nvbit.h" #ifdef USING_QSIM #include "trace_gen_x86.h" @@ -190,12 +191,15 @@ void trace_read_c::setup_trace(int core_id, int sim_thread_id) { core_c *core = m_simBase->m_core_pointers[core_id]; thread_s *thread_trace_info = core->get_trace_info(sim_thread_id); + int size = 0; // read one instruction from the trace file to get next instruction. Always one instruction // will be read ahead to get next pc address if (core->m_running_thread_num) { #ifndef USING_QSIM - gzread(thread_trace_info->m_trace_file, - thread_trace_info->m_prev_trace_info, m_trace_size); + if ((size = gzread(thread_trace_info->m_trace_file, + thread_trace_info->m_prev_trace_info, m_trace_size)) <= + 0) + printf("%s\n", gzerror(thread_trace_info->m_trace_file, 0)); #else m_tg->read_trace(core_id, (void *)(thread_trace_info->m_prev_trace_info), m_trace_size); @@ -686,14 +690,21 @@ trace_reader_wrapper_c::trace_reader_wrapper_c(macsim_c *simBase) { m_cpu_decoder = new a64_decoder_c(simBase, m_dprint_output); else if (KNOB(KNOB_LARGE_CORE_TYPE)->getValue() == "igpu") m_cpu_decoder = new igpu_decoder_c(simBase, m_dprint_output); + else if (KNOB(KNOB_LARGE_CORE_TYPE)->getValue() == "ptx") + m_gpu_decoder = new gpu_decoder_c(simBase, m_dprint_output); + else if (KNOB(KNOB_LARGE_CORE_TYPE)->getValue() == "nvbit") + m_gpu_decoder = new nvbit_decoder_c(simBase, m_dprint_output); else { ASSERTM(0, "Wrong core type %s\n", KNOB(KNOB_LARGE_CORE_TYPE)->getValue().c_str()); } - + //if (KNOB(KNOB_CORE_TYPE)->getValue() == "nvbit") //need to double check this // hyesoon Apr-16-2023 + // m_gpu_decoder = new nvbit_decoder_c(simBase, m_dprint_output); m_cpu_decoder->init_pin_convert(); - - m_gpu_decoder = new gpu_decoder_c(simBase, m_dprint_output); + if (KNOB(KNOB_CORE_TYPE)->getValue() == "nvbit") + m_gpu_decoder = new nvbit_decoder_c(simBase, m_dprint_output); + else + m_gpu_decoder = new gpu_decoder_c(simBase, m_dprint_output); } trace_reader_wrapper_c::trace_reader_wrapper_c() { diff --git a/src/trace_read.h b/src/trace_read.h index 89fbe02b..989168ca 100644 --- a/src/trace_read.h +++ b/src/trace_read.h @@ -57,8 +57,11 @@ POSSIBILITY OF SUCH DAMAGE. 
#define MAX_DST_NUM 6 #define MAX_GPU_SRC_NUM 5 #define MAX_GPU_DST_NUM 4 +#define MAX_NVBIT_SRC_NUM 4 +#define MAX_NVBIT_DST_NUM 4 #define CPU_TRACE_SIZE (sizeof(trace_info_cpu_s) - sizeof(uint64_t)) #define GPU_TRACE_SIZE (sizeof(trace_info_gpu_small_s)) +#define NVBIT_TRACE_SIZE (sizeof(trace_info_nvbit_small_s)) #define MAX_TR_OPCODE 452 // ARM_INS_ENDING /////////////////////////////////////////////////////////////////////////////////////////////// @@ -223,6 +226,75 @@ typedef struct trace_info_gpu_s { m_next_inst_addr; // next pc address, not present in raw trace format } trace_info_gpu_s; +// the same structure as the trace generator + +typedef struct trace_info_nvbit_small_s { + uint8_t m_opcode; + bool m_is_fp; + bool m_is_load; + uint8_t m_cf_type; + uint8_t m_num_read_regs; + uint8_t m_num_dest_regs; + uint16_t m_src[MAX_NVBIT_SRC_NUM]; + uint16_t m_dst[MAX_NVBIT_DST_NUM]; + uint8_t m_size; + + uint32_t m_active_mask; + uint32_t m_br_taken_mask; + uint64_t m_inst_addr; + uint64_t m_br_target_addr; + union { + uint64_t m_reconv_inst_addr; + uint64_t m_mem_addr; + }; + union { + uint8_t m_mem_access_size; + uint8_t m_barrier_id; + }; + uint16_t m_num_barrier_threads; + union { + uint8_t m_addr_space; // for loads, stores, atomic, prefetch(?) + uint8_t m_level; // for membar + }; + uint8_t m_cache_level; // for prefetch? + uint8_t m_cache_operator; // for loads, stores, atomic, prefetch(?) +} trace_info_nvbit_small_s; + +//trace_info_nvbit_small_s + m_next_inst_addr +typedef struct trace_info_nvbit_s { + uint8_t m_opcode; + bool m_is_fp; + bool m_is_load; + uint8_t m_cf_type; + uint8_t m_num_read_regs; + uint8_t m_num_dest_regs; + uint16_t m_src[MAX_NVBIT_SRC_NUM]; + uint16_t m_dst[MAX_NVBIT_DST_NUM]; + uint8_t m_size; + + uint32_t m_active_mask; + uint32_t m_br_taken_mask; + uint64_t m_inst_addr; + uint64_t m_br_target_addr; + union { + uint64_t m_reconv_inst_addr; + uint64_t m_mem_addr; + }; + union { + uint8_t m_mem_access_size; + uint8_t m_barrier_id; + }; + uint16_t m_num_barrier_threads; + union { + uint8_t m_addr_space; // for loads, stores, atomic, prefetch(?) + uint8_t m_level; // for membar + }; + uint8_t m_cache_level; // for prefetch? + uint8_t m_cache_operator; // for loads, stores, atomic, prefetch(?) 
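+ // NOTE: the fields above must stay layout-compatible with trace_info_nvbit_small_s, + // since the reader copies NVBIT_TRACE_SIZE raw bytes straight into this struct and + // fills only m_next_inst_addr separately, from the following trace record.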
+ uint64_t m_next_inst_addr; // next pc address, not present in raw trace fo + +} trace_info_nvbit_s; + /////////////////////////////////////////////////////////////////////////////////////////////// /// \brief structure to hold decoded uop information /// @@ -586,6 +658,173 @@ typedef enum GPU_FENCE_LEVEL_ENUM_ { GPU_FENCE_LAST } GPU_FENCE_LEVEL_ENUM; +typedef enum NVBIT_OPCODE_ { + NVBIT_FADD, + NVBIT_FADD32I, + NVBIT_FCHK, + NVBIT_FFMA32I, + NVBIT_FFMA, + NVBIT_FMNMX, + NVBIT_FMUL, + NVBIT_FMUL32I, + NVBIT_FSEL, + NVBIT_FSET, + NVBIT_FSETP, + NVBIT_FSWZADD, + NVBIT_MUFU, + NVBIT_HADD2, + NVBIT_HADD2_32I, + NVBIT_HFMA2, + NVBIT_HFMA2_32I, + NVBIT_HMMA, + NVBIT_HMUL2, + NVBIT_HMUL2_32I, + NVBIT_HSET2, + NVBIT_HSETP2, + NVBIT_DADD, + NVBIT_DFMA, + NVBIT_DMUL, + NVBIT_DSETP, + NVBIT_BMMA, + NVBIT_BMSK, + NVBIT_BREV, + NVBIT_FLO, + NVBIT_IABS, + NVBIT_IADD, + NVBIT_IADD3, + NVBIT_IADD32I, + NVBIT_IDP, + NVBIT_IDP4A, + NVBIT_IMAD, + NVBIT_IMMA, + NVBIT_IMNMX, + NVBIT_IMUL, + NVBIT_IMUL32I, + NVBIT_ISCADD, + NVBIT_ISCADD32I, + NVBIT_ISETP, + NVBIT_LEA, + NVBIT_LOP, + NVBIT_LOP3, + NVBIT_LOP32I, + NVBIT_POPC, + NVBIT_SHF, + NVBIT_SHL, + NVBIT_SHR, + NVBIT_VABSDIFF, + NVBIT_VABSDIFF4, + NVBIT_F2F, + NVBIT_F2I, + NVBIT_I2F, + NVBIT_I2I, + NVBIT_I2IP, + NVBIT_FRND, + NVBIT_MOV, + NVBIT_MOV32I, + NVBIT_MOVM, + NVBIT_PRMT, + NVBIT_SEL, + NVBIT_SGXT, + NVBIT_SHFL, + NVBIT_PLOP3, + NVBIT_PSETP, + NVBIT_P2R, + NVBIT_R2P, + NVBIT_LD, + NVBIT_LDC, + NVBIT_LDG, + NVBIT_LDL, + NVBIT_LDS, + NVBIT_LDSM, + NVBIT_ST, + NVBIT_STG, + NVBIT_STL, + NVBIT_STS, + NVBIT_MATCH, + NVBIT_QSPC, + NVBIT_ATOM, + NVBIT_ATOMS, + NVBIT_ATOMG, + NVBIT_RED, + NVBIT_CCTL, + NVBIT_CCTLL, + NVBIT_ERRBAR, + NVBIT_MEMBAR, + NVBIT_CCTLT, + NVBIT_R2UR, + NVBIT_S2UR, + NVBIT_UBMSK, + NVBIT_UBREV, + NVBIT_UCLEA, + NVBIT_UFLO, + NVBIT_UIADD3, + NVBIT_UIADD3_64, + NVBIT_UIMAD, + NVBIT_UISETP, + NVBIT_ULDC, + NVBIT_ULEA, + NVBIT_ULOP, + NVBIT_ULOP3, + NVBIT_ULOP32I, + NVBIT_UMOV, + NVBIT_UP2UR, + NVBIT_UPLOP3, + NVBIT_UPOPC, + NVBIT_UPRMT, + NVBIT_UPSETP, + NVBIT_UR2UP, + NVBIT_USEL, + NVBIT_USGXT, + NVBIT_USHF, + NVBIT_USHL, + NVBIT_USHR, + NVBIT_VOTEU, + NVBIT_TEX, + NVBIT_TLD, + NVBIT_TLD4, + NVBIT_TMML, + NVBIT_TXD, + NVBIT_TXQ, + NVBIT_SUATOM, + NVBIT_SULD, + NVBIT_SURED, + NVBIT_SUST, + NVBIT_BMOV, + NVBIT_BPT, + NVBIT_BRA, + NVBIT_BREAK, + NVBIT_BRX, + NVBIT_BRXU, + NVBIT_BSSY, + NVBIT_BSYNC, + NVBIT_CALL, + NVBIT_EXIT, + NVBIT_JMP, + NVBIT_JMX, + NVBIT_JMXU, + NVBIT_KILL, + NVBIT_NANOSLEEP, + NVBIT_RET, + NVBIT_RPCMOV, + NVBIT_RTT, + NVBIT_WARPSYNC, + NVBIT_YIELD, + NVBIT_B2R, + NVBIT_BAR, + NVBIT_CS2R, + NVBIT_DEPBAR, + NVBIT_GETLMEMBASE, + NVBIT_LEPC, + NVBIT_NOP, + NVBIT_PMTRIG, + NVBIT_R2B, + NVBIT_S2R, + NVBIT_SETCTAID, + NVBIT_SETLMEMBASE, + NVBIT_VOTE, + NVBIT_OPCODE_LAST +} NVBIT_OPCODE; + // in trace generator, special registers are assigned values starting from 200 // matches order in ocelot/ir/interface/PTXOperand.h typedef enum GPU_SPECIAL_REGISTER_ENUM_ { diff --git a/src/trace_read_nvbit.cc b/src/trace_read_nvbit.cc new file mode 100644 index 00000000..537d1b88 --- /dev/null +++ b/src/trace_read_nvbit.cc @@ -0,0 +1,1600 @@ +/* +Copyright (c) <2012>, All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list of conditions +and the following disclaimer. 
+ +Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or other materials provided +with the distribution. + +Neither the name of the nor the names of its contributors +may be used to endorse or promote products derived from this software without specific prior +written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY +AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +*/ + +/********************************************************************************************** + * File : trace_read_nvbit.cc + * Author : HPArch Research Group + * Date : + * SVN : $Id: trace_read.cc 912 2009-11-20 19:09:21Z kacear $ + * Description : Trace read and decode for NVBit GPU traces + *********************************************************************************************/ + +#include +#include + +#include "assert_macros.h" +#include "trace_read.h" +#include "trace_read_nvbit.h" +#include "uop.h" +#include "global_types.h" +#include "core.h" +#include "knob.h" +#include "process_manager.h" +#include "debug_macros.h" +#include "statistics.h" +#include "frontend.h" +#include "statsEnums.h" +#include "utils.h" +#include "pref_common.h" +#include "readonly_cache.h" +#include "sw_managed_cache.h" +#include "memory.h" +#include "inst_info.h" + +#include "all_knobs.h" + +/////////////////////////////////////////////////////////////////////////////////////////////// + +#define DEBUG(args...) _DEBUG(*KNOB(KNOB_DEBUG_TRACE_READ), ##args) +#define DEBUG_CORE(m_core_id, args...) \ + if (m_core_id == *m_simBase->m_knobs->KNOB_DEBUG_CORE_ID) { \ + _DEBUG(*m_simBase->m_knobs->KNOB_DEBUG_TRACE_READ, ##args); \ + } + +/////////////////////////////////////////////////////////////////////////////////////////////// + +/** + * Constructor + */ +// move constructor to header file + +nvbit_decoder_c::nvbit_decoder_c(macsim_c *simBase, ofstream *m_dprint_output) + : trace_read_c(simBase, m_dprint_output) { + m_trace_size = NVBIT_TRACE_SIZE; + + // map opcode type to uop type + init_pin_convert(); +} + +/** + * Destructor + */ +nvbit_decoder_c::~nvbit_decoder_c() { +} + +/** + * In case of GPU simulation, by our design, each uncoalesced access will be one + trace instruction. To merge these accesses back into one instruction, we need to read the trace + file ahead. 
+ * @param core_id - core id + * @param trace_info - trace information to store an instruction + * @param sim_thread_id - thread id + * @param inst_read - indicate instruction read successful + * @see get_uops_from_traces + */ +bool nvbit_decoder_c::peek_trace(int core_id, void *t_info, int sim_thread_id, + bool *inst_read) { + core_c *core = m_simBase->m_core_pointers[core_id]; + int bytes_read = -1; + thread_s *thread_trace_info = core->get_trace_info(sim_thread_id); + bool ret_val = false; + trace_info_nvbit_s *trace_info = static_cast<trace_info_nvbit_s *>(t_info); + + if (!thread_trace_info->m_buffer_exhausted) { + memcpy(trace_info, + &thread_trace_info + ->m_buffer[m_trace_size * thread_trace_info->m_buffer_index], + m_trace_size); + thread_trace_info->m_buffer_index = + (thread_trace_info->m_buffer_index + 1) % k_trace_buffer_size; + + if (thread_trace_info->m_buffer_index >= + thread_trace_info->m_buffer_index_max) { + bytes_read = 0; + } else { + bytes_read = m_trace_size; + } + + // we consumed all trace buffer entries + if (thread_trace_info->m_buffer_index == 0) { + thread_trace_info->m_buffer_exhausted = true; + } + } + // otherwise, read one instruction directly from the trace file + else { + bytes_read = gzread(thread_trace_info->m_trace_file, trace_info, m_trace_size); + } + + if (m_trace_size == bytes_read) { + *inst_read = true; + ret_val = true; + } else if (0 == bytes_read) { + *inst_read = false; + ret_val = true; + } else { + *inst_read = false; + ret_val = false; + } + + return ret_val; +} + +/** + * After peeking the trace, in case of failure, we need to rewind the trace file. + * @param core_id - core id + * @param sim_thread_id - thread id + * @param num_inst - number of instructions to rewind + * @see peek_trace + */ +bool nvbit_decoder_c::ungetch_trace(int core_id, int sim_thread_id, + int num_inst) { + core_c *core = m_simBase->m_core_pointers[core_id]; + thread_s *thread_trace_info = core->get_trace_info(sim_thread_id); + + // if we read instructions and stored them in the buffer, reduce the buffer index only + if (thread_trace_info->m_buffer_index >= num_inst) { + thread_trace_info->m_buffer_index -= num_inst; + return true; + } + // more instructions to rewind. + else if (thread_trace_info->m_buffer_index) { + num_inst -= thread_trace_info->m_buffer_index; + thread_trace_info->m_buffer_index = 0; + } + + off_t offset = gzseek(thread_trace_info->m_trace_file, + -1 * num_inst * m_trace_size, SEEK_CUR); + if (offset == -1) { + return false; + } + return true; +} + +/** + * Dump out instruction information to the file. 
At most 50000 instructions will be printed + * @param t_info - trace information + * @param core_id - core id + * @param thread_id - thread id + */ +void nvbit_decoder_c::dprint_inst(void *trace_info, int core_id, + int thread_id) { + if (m_dprint_count++ >= 50000 || !*KNOB(KNOB_DEBUG_PRINT_TRACE)) return; + trace_info_nvbit_s *inst_info = static_cast(trace_info); + + (*m_dprint_output) << "*** begin of the data structure *** " << endl; + (*m_dprint_output) << "core_id:" << core_id << " thread_id:" << thread_id + << endl; + (*m_dprint_output) << "opcode: " << g_tr_opcode_names[inst_info->m_opcode] + << endl; + (*m_dprint_output) << "num_read_regs: " + << (uint32_t)inst_info->m_num_read_regs << endl; + (*m_dprint_output) << "num_dest_regs: " + << (uint32_t)inst_info->m_num_dest_regs << endl; + + for (uint32_t ii = 0; ii < (uint32_t)inst_info->m_num_read_regs; ++ii) { + (*m_dprint_output) << "src" << ii << ": " << (uint32_t)inst_info->m_src[ii] + << endl; + } + + for (uint32_t ii = 0; ii < (uint32_t)inst_info->m_num_dest_regs; ++ii) + (*m_dprint_output) << "dst" << ii << ": " << (uint32_t)inst_info->m_dst[ii] + << endl; + + (*m_dprint_output) << "is_fp: " << inst_info->m_is_fp << endl; + (*m_dprint_output) << "cf_type: " << g_tr_cf_names[inst_info->m_cf_type] + << endl; + (*m_dprint_output) << "is_load: " << inst_info->m_is_load << endl; + + (*m_dprint_output) << "inst_size: " << (uint32_t)inst_info->m_size << endl; + + (*m_dprint_output) << "inst_addr: " << hex << inst_info->m_inst_addr << dec + << endl; + (*m_dprint_output) << "active_mask: " << hex << inst_info->m_active_mask + << dec << endl; + (*m_dprint_output) << "br_taken_mask: " << hex << inst_info->m_br_taken_mask + << dec << endl; + + (*m_dprint_output) << "br_target_addr: " << hex << inst_info->m_br_target_addr + << dec << endl; + (*m_dprint_output) << "reconv_inst_addr/mem_addr: " << hex + << inst_info->m_reconv_inst_addr << dec << endl; + + (*m_dprint_output) << "mem_addr/reconv_inst_addr: " << hex + << inst_info->m_mem_addr << dec << endl; + (*m_dprint_output) << "mem_access_size/barrier_id: " + << (uint32_t)inst_info->m_mem_access_size << endl; + (*m_dprint_output) << "num_barrier_threads: " + << (uint32_t)inst_info->m_num_barrier_threads << endl; + if (inst_info->m_opcode == GPU_MEMBAR_CTA || + inst_info->m_opcode == GPU_MEMBAR_GL || + inst_info->m_opcode == GPU_MEMBAR_SYS) { + (*m_dprint_output) << "addr_space/fence_level: " << g_addr_space_names[0] + << endl; + } else { + (*m_dprint_output) << "addr_space/fence_level: " + << g_addr_space_names[inst_info->m_addr_space] << endl; + } + //(*m_dprint_output) << "cache_operator: " + // << g_cache_op_names[inst_info->m_cache_operator] << endl; + + (*m_dprint_output) << "barrier_id/mem_access_size: " + << (uint32_t)inst_info->m_barrier_id << endl; + + //(*m_dprint_output) << "cache_level: " + // << g_cache_level_names[inst_info->m_cache_level] << endl; + if (inst_info->m_level < GPU_FENCE_LAST) { + (*m_dprint_output) << "fence_level/addr_space: " + << g_fence_level_names[inst_info->m_level] << endl; + } else { + (*m_dprint_output) << "fence_level/addr_space: " << g_fence_level_names[0] + << endl; + } + + (*m_dprint_output) << "*** end of the data structure *** " << endl << endl; +} + +/////////////////////////////////////////////////////////////////////////////////////////////// + +// FIXME +/** + * GPU simulation : Read trace ahead to read synchronization information + * @param trace_info - trace information + * @see process_manager_c::sim_thread_schedule + */ + +void 
nvbit_decoder_c::pre_read_trace(thread_s *trace_info) { + int bytes_read; + trace_info_nvbit_s inst_info; + + while ((bytes_read = gzread(trace_info->m_trace_file, &inst_info, + m_trace_size)) == m_trace_size) { + // printing opcode.. + printf("%x ", inst_info.m_opcode); + } + printf("\n"); + gzrewind(trace_info->m_trace_file); +} + +/////////////////////////////////////////////////////////////////////////////////////////////// + +/** + * From statis instruction, add dynamic information such as load address, branch target, ... + * @param info - instruction information from the hash table + * @param pi - raw trace information + * @param trace_uop - MacSim uop type + * @param rep_offset - repetition offet + * @param core_id - core id + */ +void nvbit_decoder_c::convert_dyn_uop(inst_info_s *info, void *trace_info, + trace_uop_s *trace_uop, Addr rep_offset, + int core_id) { + core_c *core = m_simBase->m_core_pointers[core_id]; + trace_info_nvbit_s *pi = static_cast(trace_info); + trace_uop->m_va = 0; + + trace_uop->m_active_mask = pi->m_active_mask; + if (info->m_table_info->m_cf_type) { + trace_uop->m_actual_taken = pi->m_br_taken_mask ? true : false; + trace_uop->m_target = pi->m_br_target_addr; + + trace_uop->m_taken_mask = pi->m_br_taken_mask; + trace_uop->m_reconverge_addr = pi->m_reconv_inst_addr; + } + // TODO (jaekyu, 2-9-2013) + // what is AMP_VAL? + else if (info->m_table_info->m_mem_type) { + int amp_val = *KNOB(KNOB_MEM_SIZE_AMP); + + if (info->m_table_info->m_mem_type == MEM_ST || + info->m_table_info->m_mem_type == MEM_ST_LM || + info->m_table_info->m_mem_type == MEM_ST_SM || + info->m_table_info->m_mem_type == MEM_ST_GM) { + trace_uop->m_va = MIN2((pi->m_mem_addr + rep_offset) * amp_val, MAX_ADDR); + trace_uop->m_mem_size = pi->m_mem_access_size * amp_val; + } else if ((info->m_table_info->m_mem_type == MEM_LD) || + (info->m_table_info->m_mem_type == MEM_LD_CM) || + (info->m_table_info->m_mem_type == MEM_LD_GM) || + (info->m_table_info->m_mem_type == MEM_LD_LM) || + (info->m_table_info->m_mem_type == MEM_LD_SM)) { + if (info->m_trace_info.m_second_mem) { + assert( + 0); // nbl - mar-19-2013: ptx instructions access only one memory location + // trace_uop->m_va = MIN2((pi->m_ld_vaddr2 + rep_offset)*amp_val, MAX_ADDR); + } else + trace_uop->m_va = MIN2((pi->m_mem_addr + rep_offset) * amp_val, MAX_ADDR); + + trace_uop->m_mem_size = pi->m_mem_access_size * amp_val; + } + } + + // next pc + trace_uop->m_npc = trace_uop->m_addr; +} + +/////////////////////////////////////////////////////////////////////////////////////////////// + +/** + * Function to decode an instruction from the trace file into a sequence of uops + * @param pi - raw trace format + * @param trace_uop - micro uops storage for this instruction + * @param core_id - core id + * @param sim_thread_id - thread id + */ +inst_info_s *nvbit_decoder_c::convert_pinuop_to_t_uop(void *trace_info, + trace_uop_s **trace_uop, + int core_id, + int sim_thread_id) { + core_c *core = m_simBase->m_core_pointers[core_id]; + trace_info_nvbit_s *pi = static_cast(trace_info); + + // simulator maintains a cache of decoded instructions (uop) for each process, + // this avoids decoding of instructions everytime an instruction is executed + int process_id = core->get_trace_info(sim_thread_id)->m_process->m_process_id; + hash_c *htable = m_simBase->m_inst_info_hash[process_id]; + + // since each instruction can be decoded into multiple uops, the key to the + // hashtable has to be (instruction addr + something else) + // the instruction addr is 
shifted left by 3-bits and the number of the uop + // in the decoded sequence is added to the shifted value to obtain the key + bool new_entry = false; + Addr key_addr = (pi->m_inst_addr << 3); + + // Get instruction information from the hash table if exists. + // Else create a new entry + inst_info_s *info = htable->hash_table_access_create(key_addr, &new_entry); + + inst_info_s *first_info = info; + int num_uop = 0; + int dyn_uop_counter = 0; + bool tmp_reg_needed = false; + bool inst_has_ALU_uop = false; + bool inst_has_ld_uop = false; + int ii, jj, kk; + + if (new_entry) { + // Since we found a new instruction, we need to decode this instruction and store all + // uops to the hash table + int write_dest_reg = 0; + + trace_uop[0]->m_rep_uop_num = 0; + trace_uop[0]->m_opcode = pi->m_opcode; + + // temporal register rules: + // load->dest_reg (through tmp), load->store (through tmp), dest_reg->store (real reg) + // load->cf (through tmp), dest_reg->cf (thought dest), st->cf (no dependency) + + /// + /// 1. This instruction has a memory load operation + /// + if (pi->m_is_load) { + num_uop = 1; + + // set memory type + switch (pi->m_opcode) { + case NVBIT_LD: + trace_uop[0]->m_mem_type = MEM_LD; + break; + case NVBIT_LDC: + trace_uop[0]->m_mem_type = MEM_LD_CM; + break; + case NVBIT_LDG: + trace_uop[0]->m_mem_type = MEM_LD_GM; + break; + case NVBIT_LDL: + trace_uop[0]->m_mem_type = MEM_LD_LM; + break; + case NVBIT_LDS: + case NVBIT_LDSM: + trace_uop[0]->m_mem_type = MEM_LD_SM; + break; + default: + trace_uop[0]->m_mem_type = MEM_LD; + break; + } + + trace_uop[0]->m_cf_type = NOT_CF; + trace_uop[0]->m_op_type = (pi->m_is_fp) ? UOP_FMEM : UOP_IMEM; + trace_uop[0]->m_bar_type = NOT_BAR; + trace_uop[0]->m_num_dest_regs = pi->m_num_dest_regs; + trace_uop[0]->m_num_src_regs = pi->m_num_read_regs; + trace_uop[0]->m_pin_2nd_mem = 0; + trace_uop[0]->m_eom = 0; + trace_uop[0]->m_alu_uop = false; + trace_uop[0]->m_inst_size = pi->m_size; + + // m_has_immediate is meaningless for GPU traces + trace_uop[0]->m_mul_mem_uops = 0; + + write_dest_reg = 1; + + if (trace_uop[0]->m_mem_type == MEM_LD || + trace_uop[0]->m_mem_type == MEM_LD_CM || + trace_uop[0]->m_mem_type == MEM_LD_GM || + trace_uop[0]->m_mem_type == MEM_LD_LM || + trace_uop[0]->m_mem_type == MEM_LD_SM) { + inst_has_ld_uop = true; + } + } // HAS_LOAD + + // Add one more uop when temporary register is required + if (pi->m_num_dest_regs && !write_dest_reg) { + trace_uop_s *cur_trace_uop = trace_uop[num_uop++]; + if (inst_has_ld_uop) { + tmp_reg_needed = true; + } + + cur_trace_uop->m_opcode = pi->m_opcode; + cur_trace_uop->m_mem_type = NOT_MEM; + cur_trace_uop->m_cf_type = NOT_CF; + cur_trace_uop->m_op_type = + (Uop_Type)((pi->m_is_fp) ? m_fp_uop_table[pi->m_opcode] + : m_int_uop_table[pi->m_opcode]); + cur_trace_uop->m_bar_type = NOT_BAR; + cur_trace_uop->m_num_src_regs = pi->m_num_read_regs; + cur_trace_uop->m_num_dest_regs = pi->m_num_dest_regs; + cur_trace_uop->m_pin_2nd_mem = 0; + cur_trace_uop->m_eom = 0; + cur_trace_uop->m_alu_uop = true; + + inst_has_ALU_uop = true; + } + + /// + /// 2. 
Instruction has a memory store operation + /// + if (pi->m_opcode == NVBIT_ST || pi->m_opcode == NVBIT_STG || + pi->m_opcode == NVBIT_STL || pi->m_opcode == NVBIT_STS) { + trace_uop_s *cur_trace_uop = trace_uop[num_uop++]; + if (inst_has_ld_uop) tmp_reg_needed = true; + + // set memory type + switch (pi->m_opcode) { + case NVBIT_ST: + cur_trace_uop->m_mem_type = MEM_ST; + break; + case NVBIT_STG: + cur_trace_uop->m_mem_type = MEM_ST_GM; + break; + case NVBIT_STL: + cur_trace_uop->m_mem_type = MEM_ST_LM; + break; + case NVBIT_STS: + cur_trace_uop->m_mem_type = MEM_ST_SM; + break; + default: + cur_trace_uop->m_mem_type = MEM_ST; + break; + } + + cur_trace_uop->m_opcode = pi->m_opcode; + cur_trace_uop->m_cf_type = NOT_CF; + cur_trace_uop->m_op_type = (pi->m_is_fp) ? UOP_FMEM : UOP_IMEM; + cur_trace_uop->m_bar_type = NOT_BAR; + cur_trace_uop->m_num_src_regs = pi->m_num_read_regs; + cur_trace_uop->m_num_dest_regs = 0; + cur_trace_uop->m_pin_2nd_mem = 0; + cur_trace_uop->m_eom = 0; + cur_trace_uop->m_alu_uop = false; + cur_trace_uop->m_inst_size = pi->m_size; + cur_trace_uop->m_mul_mem_uops = 0; + } + + /// + /// 3. Instruction has a branch operation + /// + if (pi->m_cf_type) { + trace_uop_s *cur_trace_uop = trace_uop[num_uop++]; + + if (inst_has_ld_uop) tmp_reg_needed = true; + + cur_trace_uop->m_mem_type = NOT_MEM; + cur_trace_uop->m_cf_type = + (Cf_Type)((pi->m_cf_type >= PIN_CF_SYS) ? CF_ICO : pi->m_cf_type); + cur_trace_uop->m_op_type = UOP_CF; + cur_trace_uop->m_bar_type = NOT_BAR; + cur_trace_uop->m_num_src_regs = pi->m_num_read_regs; + cur_trace_uop->m_num_dest_regs = 0; + cur_trace_uop->m_pin_2nd_mem = 0; + cur_trace_uop->m_eom = 0; + cur_trace_uop->m_alu_uop = false; + cur_trace_uop->m_inst_size = pi->m_size; + } + + // To-do.. fix GPU_ to NVBIT_ + // ASSERTM(pi->m_opcode != GPU_BAR_ARRIVE && pi->m_opcode != GPU_BAR_RED && + // pi->m_opcode != GPU_MEMBAR_CTA && + // pi->m_opcode != GPU_MEMBAR_GL && pi->m_opcode != GPU_MEMBAR_SYS, + // "unsupported uop - %d - %s", + // pi->m_opcode, nvbit_decoder_c::g_tr_opcode_names[pi->m_opcode]); + + /// + /// Non-memory, non-branch instruction + /// + if (num_uop == 0) { + trace_uop[0]->m_opcode = pi->m_opcode; + trace_uop[0]->m_mem_type = NOT_MEM; + trace_uop[0]->m_cf_type = NOT_CF; + trace_uop[0]->m_op_type = UOP_NOP; + trace_uop[0]->m_bar_type = NOT_BAR; + trace_uop[0]->m_num_dest_regs = 0; + trace_uop[0]->m_num_src_regs = 0; + trace_uop[0]->m_pin_2nd_mem = 0; + trace_uop[0]->m_eom = 1; + trace_uop[0]->m_inst_size = pi->m_size; + ++num_uop; + } + + info->m_trace_info.m_bom = true; + info->m_trace_info.m_eom = false; + info->m_trace_info.m_num_uop = num_uop; + + /// + /// Process each static uop to dynamic uop + /// + // GPUs should have only one uop per inst! + for (ii = 0; ii < num_uop; ++ii) { + // For the first uop, we have already created hash entry. 
However, for following uops + // we need to create hash entries + if (ii > 0) { + key_addr = ((pi->m_inst_addr << 3) + ii); + info = htable->hash_table_access_create(key_addr, &new_entry); + info->m_trace_info.m_bom = false; + info->m_trace_info.m_eom = false; + } + ASSERTM(new_entry, "Add new uops to hash_table for core id::%d\n", + core_id); + + trace_uop[ii]->m_addr = pi->m_inst_addr; + + DEBUG_CORE(core_id, + "pi->instruction_addr:0x%llx trace_uop[%d]->addr:0x%llx " + "num_src_regs:%d num_read_regs:%d " + "pi:num_dst_regs:%d uop:num_dst_regs:%d \n", + (Addr)(pi->m_inst_addr), ii, trace_uop[ii]->m_addr, + trace_uop[ii]->m_num_src_regs, pi->m_num_read_regs, + pi->m_num_dest_regs, trace_uop[ii]->m_num_dest_regs); + + // set source register + for (jj = 0; jj < trace_uop[ii]->m_num_src_regs; ++jj) { + (trace_uop[ii])->m_srcs[jj].m_type = (Reg_Type)0; + (trace_uop[ii])->m_srcs[jj].m_id = pi->m_src[jj]; + (trace_uop[ii])->m_srcs[jj].m_reg = pi->m_src[jj]; + } + + // store or control flow has a dependency whoever the last one + if ((trace_uop[ii]->m_mem_type == MEM_ST) || + (trace_uop[ii]->m_mem_type == MEM_ST_LM) || + (trace_uop[ii]->m_mem_type == MEM_ST_SM) || + (trace_uop[ii]->m_mem_type == MEM_ST_GM) || + (trace_uop[ii]->m_cf_type != NOT_CF)) { + if (tmp_reg_needed && !inst_has_ALU_uop) { + (trace_uop[ii])->m_srcs[jj].m_type = (Reg_Type)0; + (trace_uop[ii])->m_srcs[jj].m_id = TR_REG_TMP0; + (trace_uop[ii])->m_srcs[jj].m_reg = TR_REG_TMP0; + trace_uop[ii]->m_num_src_regs += 1; + } else if (inst_has_ALU_uop) { + for (kk = 0; kk < pi->m_num_dest_regs; ++kk) { + (trace_uop[ii])->m_srcs[jj + kk].m_type = (Reg_Type)0; + (trace_uop[ii])->m_srcs[jj + kk].m_id = pi->m_dst[kk]; + (trace_uop[ii])->m_srcs[jj + kk].m_reg = pi->m_dst[kk]; + } + + trace_uop[ii]->m_num_src_regs += pi->m_num_dest_regs; + } + } + + // alu uop only has a dependency with a temp register + if (trace_uop[ii]->m_alu_uop) { + if (tmp_reg_needed) { + (trace_uop[ii])->m_srcs[jj].m_type = (Reg_Type)0; + (trace_uop[ii])->m_srcs[jj].m_id = TR_REG_TMP0; + (trace_uop[ii])->m_srcs[jj].m_reg = TR_REG_TMP0; + trace_uop[ii]->m_num_src_regs += 1; + } + } + + for (jj = 0; jj < trace_uop[ii]->m_num_dest_regs; ++jj) { + (trace_uop[ii])->m_dests[jj].m_type = (Reg_Type)0; + (trace_uop[ii])->m_dests[jj].m_id = pi->m_dst[jj]; + (trace_uop[ii])->m_dests[jj].m_reg = pi->m_dst[jj]; + } + + // add tmp register as a destination register + if (tmp_reg_needed && ((trace_uop[ii]->m_mem_type == MEM_LD) || + (trace_uop[ii]->m_mem_type == MEM_LD_CM) || + (trace_uop[ii]->m_mem_type == MEM_LD_GM) || + (trace_uop[ii]->m_mem_type == MEM_LD_LM) || + (trace_uop[ii]->m_mem_type == MEM_LD_SM))) { + (trace_uop[ii])->m_dests[jj].m_type = (Reg_Type)0; + (trace_uop[ii])->m_dests[jj].m_id = TR_REG_TMP0; + (trace_uop[ii])->m_dests[jj].m_reg = TR_REG_TMP0; + trace_uop[ii]->m_num_dest_regs += 1; + } + + // NVBit trace tool does not support yet... (euijun Feb 28 2024) + // // the last uop + // if (ii == (num_uop - 1) && + // trace_uop[num_uop - 1]->m_mem_type == NOT_MEM) { + // if (pi->m_opcode == NVBIT_BAR) { + // // only the last instruction will have bar type - this is in case of + // // CPU, in case of GPU there is always only one uop? 
+ // trace_uop[(num_uop - 1)]->m_bar_type = BAR_FETCH; + // } + // } + + // update instruction information with MacSim trace + convert_t_uop_to_info(trace_uop[ii], info); + + DEBUG_CORE(core_id, "tuop: pc 0x%llx num_src_reg:%d num_dest_reg:%d \n", + trace_uop[ii]->m_addr, trace_uop[ii]->m_num_src_regs, + trace_uop[ii]->m_num_dest_regs); + + trace_uop[ii]->m_info = info; + + // Add dynamic information to the uop + convert_dyn_uop(info, pi, trace_uop[ii], 0, core_id); + } + + // set end of macro flag to the last uop + trace_uop[num_uop - 1]->m_eom = 1; + + ASSERT(num_uop > 0); + } // NEW_ENTRY + /// + /// Hash table already has matching instruction, we can skip above decoding process + /// + else { + ASSERT(info); + + num_uop = info->m_trace_info.m_num_uop; + for (ii = 0; ii < num_uop; ++ii) { + if (ii > 0) { + key_addr = ((pi->m_inst_addr << 3) + ii); + info = htable->hash_table_access_create(key_addr, &new_entry); + } + ASSERTM(!new_entry, "Core id %d index %d\n", core_id, ii); + + // convert raw instruction trace to MacSim trace format + convert_info_uop(info, trace_uop[ii]); + + // add dynamic information + convert_dyn_uop(info, pi, trace_uop[ii], 0, core_id); + + trace_uop[ii]->m_info = info; + trace_uop[ii]->m_eom = 0; + trace_uop[ii]->m_addr = pi->m_inst_addr; + trace_uop[ii]->m_opcode = pi->m_opcode; + if (trace_uop[ii]->m_mem_type) { + // nagesh mar-10-2010 - to form single uop for uncoalesced memory accesses + // this checking should be done for every instance of the instruction, + // not for only the first instance, because depending on the address + // calculation, some accesses may be coalesced, some may be uncoalesced + trace_uop[ii]->m_mul_mem_uops = 0; + } + } + + // set end of macro flag to the last uop + trace_uop[num_uop - 1]->m_eom = 1; + + ASSERT(num_uop > 0); + } + + ///////////////////////////////////////////////////////////////////////////////////////////// + // end of instruction decoding + ///////////////////////////////////////////////////////////////////////////////////////////// + + dyn_uop_counter = num_uop; + ASSERT(dyn_uop_counter); + + // set eom flag and next pc address for the last uop of this instruction + trace_uop[dyn_uop_counter - 1]->m_eom = 1; + trace_uop[dyn_uop_counter - 1]->m_npc = pi->m_next_inst_addr; + + STAT_CORE_EVENT(core_id, OP_CAT_GPU_INVALID + (pi->m_opcode)); + +// nbl - mar-19-2013, power events have to be fixed +#if NBL_MAR_19_2013_TBD + if (pi->m_num_ld || pi->m_has_st) { + POWER_CORE_EVENT(core_id, POWER_SEGMENT_REGISTER_W); + } + + if (pi->m_write_flg) { + POWER_CORE_EVENT(core_id, POWER_FLAG_REGISTER_W); + } +#endif + + ASSERT(num_uop > 0); + first_info->m_trace_info.m_num_uop = num_uop; + + return first_info; +} + +/////////////////////////////////////////////////////////////////////////////////////////////// + +/** + * Get an uop from trace + * Called by frontend.cc + * @param core_id - core id + * @param uop - uop object to hold instruction information + * @param sim_thread_id thread id + */ +bool nvbit_decoder_c::get_uops_from_traces(int core_id, uop_c *uop, + int sim_thread_id) { + ASSERT(uop); + + trace_uop_s *trace_uop; + int num_uop = 0; + core_c *core = m_simBase->m_core_pointers[core_id]; + inst_info_s *info; + + // fetch ended : no uop to fetch + if (core->m_fetch_ended[sim_thread_id]) return false; + + trace_info_nvbit_s trace_info; + bool read_success = true; + thread_s *thread_trace_info = core->get_trace_info(sim_thread_id); + + if (thread_trace_info->m_thread_init) { + thread_trace_info->m_thread_init = false; + } + 
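+ // One-instruction look-ahead: m_prev_trace_info holds the record for the instruction + // being fetched now; the record read below into m_next_trace_info only supplies its + // next-PC (m_next_inst_addr) before becoming the previous record for the next fetch.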
+ /// + /// BOM (beginning of macro) : need to get a next instruction + /// + + if (thread_trace_info->m_bom) { + bool inst_read; // indicate new instruction has been read from a trace file + + if (core->m_inst_fetched[sim_thread_id] < *KNOB(KNOB_MAX_INSTS)) { + // read next instruction + read_success = read_trace(core_id, thread_trace_info->m_next_trace_info, + sim_thread_id, &inst_read); + } else { + inst_read = false; + if (!core->get_trace_info(sim_thread_id)->m_trace_ended) { + core->get_trace_info(sim_thread_id)->m_trace_ended = true; + } + } + + // Copy current instruction to data structure + memcpy(&trace_info, thread_trace_info->m_prev_trace_info, + sizeof(trace_info_nvbit_s)); + + // Set next pc address + trace_info_nvbit_s *next_trace_info = + static_cast(thread_trace_info->m_next_trace_info); + trace_info.m_next_inst_addr = next_trace_info->m_inst_addr; + + // Copy next instruction to current instruction field + memcpy(thread_trace_info->m_prev_trace_info, + thread_trace_info->m_next_trace_info, sizeof(trace_info_nvbit_s)); + + DEBUG_CORE(core_id, + "trace_read nm core_id:%d thread_id:%d pc:0x%llx opcode:%d " + "inst_count:%llu\n", + core_id, sim_thread_id, (Addr)(trace_info.m_inst_addr), + static_cast(trace_info.m_opcode), + (Counter)(thread_trace_info->m_temp_inst_count)); + + /// + /// Trace read failed + /// + if (!read_success) return false; + + // read a new instruction, so update stats + if (inst_read) { + ++core->m_inst_fetched[sim_thread_id]; + DEBUG_CORE(core_id, "core_id:%d thread_id:%d inst_num:%llu\n", core_id, + sim_thread_id, + (Counter)(thread_trace_info->m_temp_inst_count + 1)); + if (core->m_inst_fetched[sim_thread_id] > core->m_max_inst_fetched) + core->m_max_inst_fetched = core->m_inst_fetched[sim_thread_id]; + } + + // GPU simulation : if we use common cache for the shared memory + // Set appropiriate opcode type (not using shared memory) + if (*KNOB(KNOB_PTX_COMMON_CACHE)) { + switch (trace_info.m_opcode) { + case NVBIT_STS: + trace_info.m_opcode = NVBIT_STL; + break; + case NVBIT_LDS: + case NVBIT_LDSM: + trace_info.m_opcode = NVBIT_LDL; + break; + } + } + + // So far we have raw instruction format, so we need to MacSim specific trace format + info = + convert_pinuop_to_t_uop(&trace_info, thread_trace_info->m_trace_uop_array, + core_id, sim_thread_id); + + trace_uop = thread_trace_info->m_trace_uop_array[0]; + num_uop = info->m_trace_info.m_num_uop; + ASSERT(info->m_trace_info.m_num_uop > 0); + + thread_trace_info->m_num_sending_uop = 1; + thread_trace_info->m_eom = thread_trace_info->m_trace_uop_array[0]->m_eom; + thread_trace_info->m_bom = false; + + uop->m_isitBOM = true; + POWER_CORE_EVENT(core_id, POWER_INST_DECODER_R); + POWER_CORE_EVENT(core_id, POWER_OPERAND_DECODER_R); + } // END EOM + // read remaining uops from the same instruction + else { + trace_uop = thread_trace_info + ->m_trace_uop_array[thread_trace_info->m_num_sending_uop]; + info = trace_uop->m_info; + thread_trace_info->m_eom = trace_uop->m_eom; + info->m_trace_info.m_bom = 0; // because of repeat instructions .... 
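+ // Non-BOM uops reuse the trace_uop entries decoded when the first uop of this + // instruction was fetched; no additional trace record is read for them.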
+ uop->m_isitBOM = false; + ++thread_trace_info->m_num_sending_uop; + } + + // set end of macro flag + if (thread_trace_info->m_eom) { + uop->m_isitEOM = true; // mark for current uop + thread_trace_info->m_bom = true; // mark for next instruction + } else { + uop->m_isitEOM = false; + thread_trace_info->m_bom = false; + } + + if (core->get_trace_info(sim_thread_id)->m_trace_ended && uop->m_isitEOM) { + --core->m_fetching_thread_num; + core->m_fetch_ended[sim_thread_id] = true; + uop->m_last_uop = true; + DEBUG_CORE(core_id, + "core_id:%d thread_id:%d inst_num:%lld uop_num:%lld " + "fetched:%lld last uop\n", + core_id, sim_thread_id, uop->m_inst_num, uop->m_uop_num, + core->m_inst_fetched[sim_thread_id]); + } + + /* BAR_FETCH */ + if (trace_uop->m_bar_type == + BAR_FETCH) { // only last uop with have BAR_FETCH set + frontend_c *frontend = core->get_frontend(); + frontend_s *fetch_data = core->get_trace_info(sim_thread_id)->m_fetch_data; + + fetch_data->m_fetch_blocked = true; + + bool new_entry = false; + sync_thread_s *sync_info = frontend->m_sync_info->hash_table_access_create( + core->get_trace_info(sim_thread_id)->m_block_id, &new_entry); + + // new synchronization information + if (new_entry) { + sync_info->m_block_id = core->get_trace_info(sim_thread_id)->m_block_id; + sync_info->m_sync_count = 0; + sync_info->m_num_threads_in_block = + m_simBase->m_block_schedule_info[sync_info->m_block_id] + ->m_total_thread_num; + } + + ++fetch_data->m_sync_count; + fetch_data->m_sync_wait_start = core->get_cycle_count(); + } + + /// + /// Set up actual uop data structure + /// + uop->m_opcode = trace_uop->m_opcode; + uop->m_uop_type = info->m_table_info->m_op_type; + uop->m_cf_type = info->m_table_info->m_cf_type; + uop->m_mem_type = info->m_table_info->m_mem_type; + ASSERT(uop->m_mem_type >= 0 && uop->m_mem_type < NUM_MEM_TYPES); + uop->m_bar_type = trace_uop->m_bar_type; + uop->m_npc = trace_uop->m_npc; + uop->m_active_mask = trace_uop->m_active_mask; + + if (uop->m_cf_type) { + uop->m_taken_mask = trace_uop->m_taken_mask; + uop->m_reconverge_addr = trace_uop->m_reconverge_addr; + uop->m_target_addr = trace_uop->m_target; + } + + if (uop->m_opcode == GPU_EN) { + m_simBase->m_gpu_paused = false; + } + + // address translation + if (trace_uop->m_va == 0) { + uop->m_vaddr = 0; + } else { + // since we can have 64-bit address space and each trace has 32-bit address, + // using extra bits to differentiate address space of each application + uop->m_vaddr = + trace_uop->m_va + + m_simBase->m_memory->base_addr( + core_id, + (unsigned long)UINT_MAX * + (core->get_trace_info(sim_thread_id)->m_process->m_process_id) * + 10ul); + } + + uop->m_mem_size = trace_uop->m_mem_size; + + uop->m_dir = trace_uop->m_actual_taken; + uop->m_pc = info->m_addr; + uop->m_core_id = core_id; + + // we found first uop of an instruction, so add instruction count + if (uop->m_isitBOM) ++thread_trace_info->m_temp_inst_count; + + uop->m_inst_num = thread_trace_info->m_temp_inst_count; + uop->m_num_srcs = trace_uop->m_num_src_regs; + uop->m_num_dests = trace_uop->m_num_dest_regs; + + ASSERTM(uop->m_num_dests < MAX_DST_NUM, "uop->num_dests=%d MAX_DST_NUM=%d\n", + uop->m_num_dests, MAX_DST_NUM); + + // uop number is specific to the core + uop->m_unique_num = core->inc_and_get_unique_uop_num(); + + DEBUG_CORE(uop->m_core_id, + "uop_num:%llu num_srcs:%d trace_uop->num_src_regs:%d " + "num_dsts:%d num_seing_uop:%d pc:0x%llx dir:%d \n", + uop->m_uop_num, uop->m_num_srcs, trace_uop->m_num_src_regs, + uop->m_num_dests, 
thread_trace_info->m_num_sending_uop, uop->m_pc, + uop->m_dir); + + // filling the src_info, dest_info + if (uop->m_num_srcs < MAX_SRCS) { + for (int index = 0; index < uop->m_num_srcs; ++index) { + uop->m_src_info[index] = trace_uop->m_srcs[index].m_id; + // DEBUG("uop_num:%lld src_info[%d]:%d\n", uop->uop_num, index, uop->src_info[index]); + } + } else { + ASSERTM(uop->m_num_srcs < MAX_SRCS, "src_num:%d MAX_SRC:%d", + uop->m_num_srcs, MAX_SRCS); + } + + for (int index = 0; index < uop->m_num_dests; ++index) { + uop->m_dest_info[index] = trace_uop->m_dests[index].m_id; + ASSERT(trace_uop->m_dests[index].m_reg < NUM_REG_IDS); + } + + uop->m_uop_num = (thread_trace_info->m_temp_uop_count++); + uop->m_thread_id = sim_thread_id; + uop->m_block_id = ((core)->get_trace_info(sim_thread_id))->m_block_id; + uop->m_orig_block_id = + ((core)->get_trace_info(sim_thread_id))->m_orig_block_id; + uop->m_unique_thread_id = + ((core)->get_trace_info(sim_thread_id))->m_unique_thread_id; + uop->m_orig_thread_id = + ((core)->get_trace_info(sim_thread_id))->m_orig_thread_id; + + /// + /// GPU simulation : coalescing logic + /// trace always includes 32 entries (even if warp doesn't have 32 active threads) + /// for inactive threads, memory address and size are zero + /// + if (uop->m_mem_type != NOT_MEM) { + // if PTX memory instructions are decoded into multiple uops with + // more than one of them accessing memory then we have to seek + // backwards in the trace file for the second uop (and subsequent) + // ones that accesses memory + int index = thread_trace_info->m_num_sending_uop - 1; + if (!thread_trace_info->m_trace_uop_array[index]->m_eom) { + for (int ii = 0;; ++ii) { + ASSERT((index + 1) < MAX_PUP); + if (thread_trace_info->m_trace_uop_array[++index]->m_mem_type) { + ASSERTM(0, + "this condition is not handled in the code (and should never " + "occur?)!"); + } + if (thread_trace_info->m_trace_uop_array[++index]->m_eom) { + break; + } + } + } + + // if (*KNOB(KNOB_COMPUTE_CAPABILITY) == 1.3f) { + // if (*KNOB(KNOB_BYTE_LEVEL_ACCESS)) { + // // cache_line_addr = uop->m_vaddr; + // // cache_line_size = *KNOB(KNOB_MAX_TRANSACTION_SIZE); + // } + // ASSERTM(0, "TBD"); + // } else if (*KNOB(KNOB_COMPUTE_CAPABILITY) == 2.0f) { + // Addr line_addr = 0; + // Addr end_line_addr = 0; + // int line_size; + // switch (uop->m_mem_type) { + // // shared memory, parameter memory + // case MEM_LD_SM: + // case MEM_ST_SM: + // if (uop->m_vaddr && uop->m_mem_size) { + // line_addr = + // core->get_shared_memory()->base_cache_line(uop->m_vaddr); + // end_line_addr = core->get_shared_memory()->base_cache_line( + // uop->m_vaddr + uop->m_mem_size - 1); + // } + // line_size = core->get_shared_memory()->cache_line_size(); + // break; + // // constant memory + // case MEM_LD_CM: + // if (uop->m_vaddr && uop->m_mem_size) { + // line_addr = core->get_const_cache()->base_cache_line(uop->m_vaddr); + // end_line_addr = core->get_const_cache()->base_cache_line( + // uop->m_vaddr + uop->m_mem_size - 1); + // } + // line_size = core->get_const_cache()->cache_line_size(); + // break; + // // texture memory --> todo: should fix it to MEM_LD_LM + // case MEM_LD_TM: + // if (uop->m_vaddr && uop->m_mem_size) { + // line_addr = + // core->get_texture_cache()->base_cache_line(uop->m_vaddr); + // end_line_addr = core->get_texture_cache()->base_cache_line( + // uop->m_vaddr + uop->m_mem_size - 1); + // } + // line_size = core->get_texture_cache()->cache_line_size(); + // break; + // // global memory + // default: + // if (uop->m_vaddr && 
uop->m_mem_size) { + // line_addr = m_simBase->m_memory->base_addr(core_id, uop->m_vaddr); + // end_line_addr = m_simBase->m_memory->base_addr( + // core_id, uop->m_vaddr + uop->m_mem_size - 1); + // } + // line_size = m_simBase->m_memory->line_size(core_id); + // break; + // } + + // ASSERTM(ungetch_trace(core_id, sim_thread_id, 1), "mention why\n"); + + // static set + // seen_block_addr; // to efficiently track seen cache blocks + // static list + // seen_block_list; // to maintain the order of seen cache blocks - is it necessary? + // static map accessed_addr; + + // seen_block_addr.clear(); + // seen_block_list.clear(); + + // bool last_inst = false; + // bool inst_read; + // Addr addr; + // int access_size = uop->m_mem_size; + + // ASSERTM(access_size, + // "access size cannot be zero %s tid %d core %d uop num %llu block " + // "id %d orig id %d\n", + // nvbit_decoder_c::g_tr_opcode_names[uop->m_opcode], sim_thread_id, + // core_id, uop->m_uop_num, uop->m_block_id, uop->m_orig_thread_id); + + // // even if a warp has fewer than 32 threads or even if fewer than + // // 32 threads are active, there will be 32 addresses, with bytes + // // corresponding to invalid/inactive threads set to zero + // // we have read 1 out of 32 addresses + // int read_addr = 1; + // int addr_per_trace_inst = *KNOB(KNOB_TRACE_USES_64_BIT_ADDR) + // ? (m_trace_size / 8) + // : (m_trace_size / 4); + // // int addr_per_trace_inst = 1; + // // 32 instructions are guaranteed to be included + // // how does coalescing of stores happen? say multiple stores map to the same cache block, + // // but not all bytes of a cache block are written. how will the stores be communicated + // // to the l2? + // do { + // if (line_addr) { + // // if (1) { + // if (seen_block_addr.find(line_addr) == seen_block_addr.end()) { + // seen_block_addr.insert(line_addr); + // seen_block_list.push_back(line_addr); + // } + // if (seen_block_addr.find(end_line_addr) == seen_block_addr.end()) { + // seen_block_addr.insert(end_line_addr); + // seen_block_list.push_back(end_line_addr); + // } + // } + + // if (last_inst) { + // if (!thread_trace_info->m_trace_ended) { + // read_success = + // peek_trace(core_id, thread_trace_info->m_prev_trace_info, + // sim_thread_id, &inst_read); + // if (read_success) { + // if (inst_read) { + // trace_info_nvbit_s *prev_trace_info = + // static_cast( + // thread_trace_info->m_prev_trace_info); + // uop->m_npc = prev_trace_info->m_inst_addr; + // } else { + // thread_trace_info->m_trace_ended = true; + // DEBUG_CORE(core_id, "trace ended core_id:%d thread_id:%d\n", + // core_id, sim_thread_id); + // } + // } else { + // ASSERTM(0, "why?"); + // } + // } + // break; + // } + + // if (!((read_addr - 1) % addr_per_trace_inst)) { + // read_success = + // peek_trace(core_id, &trace_info, sim_thread_id, &inst_read); + // if (!read_success || (read_success && !inst_read)) { + // cout << "trace id: " << std::dec << thread_trace_info->m_trace_id << endl; + // cout << "mask: " << std::hex << uop->m_active_mask << endl; + // ASSERTM(0, "reached end without reading all addresses"); + // } + // } + + // if (*KNOB(KNOB_TRACE_USES_64_BIT_ADDR)) { + // memcpy(&addr, + // ((uint8_t *)&trace_info) + + // ((read_addr - 1) % addr_per_trace_inst) * 8, + // 8); + // } else { + // addr = 0; + // memcpy(&addr, + // ((uint8_t *)&trace_info) + + // ((read_addr - 1) % addr_per_trace_inst) * 4, + // 4); + // } + + // ++read_addr; + // if (read_addr == *KNOB(KNOB_GPU_WARP_SIZE)) { + // last_inst = true; + // } + + // if (addr && 
access_size) { + // int process_id = thread_trace_info->m_process->m_process_id; + // unsigned long offset = UINT_MAX * process_id * 10; + // addr += m_simBase->m_memory->base_addr(core_id, offset); + + // switch (uop->m_mem_type) { + // case MEM_LD_SM: + // case MEM_ST_SM: + // line_addr = core->get_shared_memory()->base_cache_line(addr); + // end_line_addr = core->get_shared_memory()->base_cache_line( + // addr + access_size - 1); + // break; + // case MEM_LD_CM: + // line_addr = core->get_const_cache()->base_cache_line(addr); + // end_line_addr = core->get_const_cache()->base_cache_line( + // addr + access_size - 1); + // break; + // // texture cache + // case MEM_LD_TM: + // line_addr = core->get_texture_cache()->base_cache_line(addr); + // end_line_addr = core->get_texture_cache()->base_cache_line( + // addr + access_size - 1); + // break; + // default: + // line_addr = m_simBase->m_memory->base_addr(core_id, addr); + // end_line_addr = + // m_simBase->m_memory->base_addr(core_id, addr + access_size - 1); + // break; + // } + // } else { + // line_addr = 0; + // end_line_addr = 0; + // } + // } while (1); + + // ASSERTM(seen_block_addr.size() == seen_block_list.size() && + // seen_block_addr.size(), + // "should be non-zero and equal: %ld, %ld, %d, %llx, %s, %lld, %d, %lld \n", + // seen_block_addr.size(), seen_block_list.size(), uop->m_mem_type, line_addr, nvbit_decoder_c::g_tr_opcode_names[uop->m_opcode], uop->m_vaddr, uop->m_mem_size, trace_uop->m_va); + + // uop->m_child_uops = new uop_c *[seen_block_addr.size()]; + // uop->m_num_child_uops = seen_block_addr.size(); + // uop->m_num_child_uops_done = 0; + // if (uop->m_num_child_uops != 64) { + // uop->m_pending_child_uops = N_BIT_MASK(uop->m_num_child_uops); + // } else { + // uop->m_pending_child_uops = N_BIT_MASK_64; + // } + // uop->m_vaddr = 0; + // uop->m_mem_size = 0; + + // uop_c *child_mem_uop = NULL; + // int count = 0; + + // auto itr = seen_block_list.begin(); + // auto end = seen_block_list.end(); + // while (itr != end) { + // Addr vaddr = *itr; + + // child_mem_uop = + // core->get_frontend()->get_uop_pool()->acquire_entry(m_simBase); + // child_mem_uop->allocate(); + // ASSERT(child_mem_uop); + + // memcpy(child_mem_uop, uop, sizeof(uop_c)); + + // child_mem_uop->m_parent_uop = uop; + // child_mem_uop->m_vaddr = vaddr; + // child_mem_uop->m_mem_size = line_size; + // child_mem_uop->m_uop_num = thread_trace_info->m_temp_uop_count++; + // child_mem_uop->m_unique_num = core->inc_and_get_unique_uop_num(); + + // uop->m_child_uops[count++] = child_mem_uop; + + // ++itr; + // } + // } + } + + DEBUG_CORE( + uop->m_core_id, + "new uop: uop_num:%lld inst_num:%lld thread_id:%d unique_num:%lld \n", + uop->m_uop_num, uop->m_inst_num, uop->m_thread_id, uop->m_unique_num); + + return read_success; +} + +/////////////////////////////////////////////////////////////////////////////////////////////// + +/** + * Initialize the mapping between trace opcode and uop type + */ +void nvbit_decoder_c::init_pin_convert(void) { + m_fp_uop_table[NVBIT_FADD] = UOP_NVBIT_FADD; + m_fp_uop_table[NVBIT_FADD32I] = UOP_NVBIT_FADD32I; + m_fp_uop_table[NVBIT_FCHK] = UOP_NVBIT_FCHK; + m_fp_uop_table[NVBIT_FFMA32I] = UOP_NVBIT_FFMA32I; + m_fp_uop_table[NVBIT_FFMA] = UOP_NVBIT_FFMA; + m_fp_uop_table[NVBIT_FMNMX] = UOP_NVBIT_FMNMX; + m_fp_uop_table[NVBIT_FMUL] = UOP_NVBIT_FMUL; + m_fp_uop_table[NVBIT_FMUL32I] = UOP_NVBIT_FMUL32I; + m_fp_uop_table[NVBIT_FSEL] = UOP_NVBIT_FSEL; + m_fp_uop_table[NVBIT_FSET] = UOP_NVBIT_FSET; + m_fp_uop_table[NVBIT_FSETP] = 
UOP_NVBIT_FSETP; + m_fp_uop_table[NVBIT_FSWZADD] = UOP_NVBIT_FSWZADD; + m_fp_uop_table[NVBIT_MUFU] = UOP_NVBIT_MUFU; + m_fp_uop_table[NVBIT_HADD2] = UOP_NVBIT_HADD2; + m_fp_uop_table[NVBIT_HADD2_32I] = UOP_NVBIT_HADD2_32I; + m_fp_uop_table[NVBIT_HFMA2] = UOP_NVBIT_HFMA2; + m_fp_uop_table[NVBIT_HFMA2_32I] = UOP_NVBIT_HFMA2_32I; + m_fp_uop_table[NVBIT_HMMA] = UOP_NVBIT_HMMA; + m_fp_uop_table[NVBIT_HMUL2] = UOP_NVBIT_HMUL2; + m_fp_uop_table[NVBIT_HMUL2_32I] = UOP_NVBIT_HMUL2_32I; + m_fp_uop_table[NVBIT_HSET2] = UOP_NVBIT_HSET2; + m_fp_uop_table[NVBIT_HSETP2] = UOP_NVBIT_HSETP2; + m_fp_uop_table[NVBIT_DADD] = UOP_NVBIT_DADD; + m_fp_uop_table[NVBIT_DFMA] = UOP_NVBIT_DFMA; + m_fp_uop_table[NVBIT_DMUL] = UOP_NVBIT_DMUL; + m_fp_uop_table[NVBIT_DSETP] = UOP_NVBIT_DSETP; + + m_int_uop_table[NVBIT_BMMA] = UOP_NVBIT_BMMA; + m_int_uop_table[NVBIT_BMSK] = UOP_NVBIT_BMSK; + m_int_uop_table[NVBIT_BREV] = UOP_NVBIT_BREV; + m_int_uop_table[NVBIT_FLO] = UOP_NVBIT_FLO; + m_int_uop_table[NVBIT_IABS] = UOP_NVBIT_IABS; + m_int_uop_table[NVBIT_IADD] = UOP_NVBIT_IADD; + m_int_uop_table[NVBIT_IADD3] = UOP_NVBIT_IADD3; + m_int_uop_table[NVBIT_IADD32I] = UOP_NVBIT_IADD32I; + m_int_uop_table[NVBIT_IDP] = UOP_NVBIT_IDP; + m_int_uop_table[NVBIT_IDP4A] = UOP_NVBIT_IDP4A; + m_int_uop_table[NVBIT_IMAD] = UOP_NVBIT_IMAD; + m_int_uop_table[NVBIT_IMMA] = UOP_NVBIT_IMMA; + m_int_uop_table[NVBIT_IMNMX] = UOP_NVBIT_IMNMX; + m_int_uop_table[NVBIT_IMUL] = UOP_NVBIT_IMUL; + m_int_uop_table[NVBIT_IMUL32I] = UOP_NVBIT_IMUL32I; + m_int_uop_table[NVBIT_ISCADD] = UOP_NVBIT_ISCADD; + m_int_uop_table[NVBIT_ISCADD32I] = UOP_NVBIT_ISCADD32I; + m_int_uop_table[NVBIT_ISETP] = UOP_NVBIT_ISETP; + m_int_uop_table[NVBIT_LEA] = UOP_NVBIT_LEA; + m_int_uop_table[NVBIT_LOP] = UOP_NVBIT_LOP; + m_int_uop_table[NVBIT_LOP3] = UOP_NVBIT_LOP3; + m_int_uop_table[NVBIT_LOP32I] = UOP_NVBIT_LOP32I; + m_int_uop_table[NVBIT_POPC] = UOP_NVBIT_POPC; + m_int_uop_table[NVBIT_SHF] = UOP_NVBIT_SHF; + m_int_uop_table[NVBIT_SHL] = UOP_NVBIT_SHL; + m_int_uop_table[NVBIT_SHR] = UOP_NVBIT_SHR; + m_int_uop_table[NVBIT_VABSDIFF] = UOP_NVBIT_VABSDIFF; + m_int_uop_table[NVBIT_VABSDIFF4] = UOP_NVBIT_VABSDIFF4; + m_int_uop_table[NVBIT_F2F] = UOP_NVBIT_F2F; + m_int_uop_table[NVBIT_F2I] = UOP_NVBIT_F2I; + m_int_uop_table[NVBIT_I2F] = UOP_NVBIT_I2F; + m_int_uop_table[NVBIT_I2I] = UOP_NVBIT_I2I; + m_int_uop_table[NVBIT_I2IP] = UOP_NVBIT_I2IP; + m_int_uop_table[NVBIT_FRND] = UOP_NVBIT_FRND; + m_int_uop_table[NVBIT_MOV] = UOP_NVBIT_MOV; + m_int_uop_table[NVBIT_MOV32I] = UOP_NVBIT_MOV32I; + m_int_uop_table[NVBIT_MOVM] = UOP_NVBIT_MOVM; + m_int_uop_table[NVBIT_PRMT] = UOP_NVBIT_PRMT; + m_int_uop_table[NVBIT_SEL] = UOP_NVBIT_SEL; + m_int_uop_table[NVBIT_SGXT] = UOP_NVBIT_SGXT; + m_int_uop_table[NVBIT_SHFL] = UOP_NVBIT_SHFL; + m_int_uop_table[NVBIT_PLOP3] = UOP_NVBIT_PLOP3; + m_int_uop_table[NVBIT_PSETP] = UOP_NVBIT_PSETP; + m_int_uop_table[NVBIT_P2R] = UOP_NVBIT_P2R; + m_int_uop_table[NVBIT_R2P] = UOP_NVBIT_R2P; + m_int_uop_table[NVBIT_LD] = UOP_NVBIT_LD; + m_int_uop_table[NVBIT_LDC] = UOP_NVBIT_LDC; + m_int_uop_table[NVBIT_LDG] = UOP_NVBIT_LDG; + m_int_uop_table[NVBIT_LDL] = UOP_NVBIT_LDL; + m_int_uop_table[NVBIT_LDS] = UOP_NVBIT_LDS; + m_int_uop_table[NVBIT_LDSM] = UOP_NVBIT_LDSM; + m_int_uop_table[NVBIT_ST] = UOP_NVBIT_ST; + m_int_uop_table[NVBIT_STG] = UOP_NVBIT_STG; + m_int_uop_table[NVBIT_STL] = UOP_NVBIT_STL; + m_int_uop_table[NVBIT_STS] = UOP_NVBIT_STS; + m_int_uop_table[NVBIT_MATCH] = UOP_NVBIT_MATCH; + m_int_uop_table[NVBIT_QSPC] = UOP_NVBIT_QSPC; + m_int_uop_table[NVBIT_ATOM] 
= UOP_NVBIT_ATOM; + m_int_uop_table[NVBIT_ATOMS] = UOP_NVBIT_ATOMS; + m_int_uop_table[NVBIT_ATOMG] = UOP_NVBIT_ATOMG; + m_int_uop_table[NVBIT_RED] = UOP_NVBIT_RED; + m_int_uop_table[NVBIT_CCTL] = UOP_NVBIT_CCTL; + m_int_uop_table[NVBIT_CCTLL] = UOP_NVBIT_CCTLL; + m_int_uop_table[NVBIT_ERRBAR] = UOP_NVBIT_ERRBAR; + m_int_uop_table[NVBIT_MEMBAR] = UOP_NVBIT_MEMBAR; + m_int_uop_table[NVBIT_CCTLT] = UOP_NVBIT_CCTLT; + m_int_uop_table[NVBIT_R2UR] = UOP_NVBIT_R2UR; + m_int_uop_table[NVBIT_S2UR] = UOP_NVBIT_S2UR; + m_int_uop_table[NVBIT_UBMSK] = UOP_NVBIT_UBMSK; + m_int_uop_table[NVBIT_UBREV] = UOP_NVBIT_UBREV; + m_int_uop_table[NVBIT_UCLEA] = UOP_NVBIT_UCLEA; + m_int_uop_table[NVBIT_UFLO] = UOP_NVBIT_UFLO; + m_int_uop_table[NVBIT_UIADD3] = UOP_NVBIT_UIADD3; + m_int_uop_table[NVBIT_UIADD3_64] = UOP_NVBIT_UIADD3_64; + m_int_uop_table[NVBIT_UIMAD] = UOP_NVBIT_UIMAD; + m_int_uop_table[NVBIT_UISETP] = UOP_NVBIT_UISETP; + m_int_uop_table[NVBIT_ULDC] = UOP_NVBIT_ULDC; + m_int_uop_table[NVBIT_ULEA] = UOP_NVBIT_ULEA; + m_int_uop_table[NVBIT_ULOP] = UOP_NVBIT_ULOP; + m_int_uop_table[NVBIT_ULOP3] = UOP_NVBIT_ULOP3; + m_int_uop_table[NVBIT_ULOP32I] = UOP_NVBIT_ULOP32I; + m_int_uop_table[NVBIT_UMOV] = UOP_NVBIT_UMOV; + m_int_uop_table[NVBIT_UP2UR] = UOP_NVBIT_UP2UR; + m_int_uop_table[NVBIT_UPLOP3] = UOP_NVBIT_UPLOP3; + m_int_uop_table[NVBIT_UPOPC] = UOP_NVBIT_UPOPC; + m_int_uop_table[NVBIT_UPRMT] = UOP_NVBIT_UPRMT; + m_int_uop_table[NVBIT_UPSETP] = UOP_NVBIT_UPSETP; + m_int_uop_table[NVBIT_UR2UP] = UOP_NVBIT_UR2UP; + m_int_uop_table[NVBIT_USEL] = UOP_NVBIT_USEL; + m_int_uop_table[NVBIT_USGXT] = UOP_NVBIT_USGXT; + m_int_uop_table[NVBIT_USHF] = UOP_NVBIT_USHF; + m_int_uop_table[NVBIT_USHL] = UOP_NVBIT_USHL; + m_int_uop_table[NVBIT_USHR] = UOP_NVBIT_USHR; + m_int_uop_table[NVBIT_VOTEU] = UOP_NVBIT_VOTEU; + m_int_uop_table[NVBIT_TEX] = UOP_NVBIT_TEX; + m_int_uop_table[NVBIT_TLD] = UOP_NVBIT_TLD; + m_int_uop_table[NVBIT_TLD4] = UOP_NVBIT_TLD4; + m_int_uop_table[NVBIT_TMML] = UOP_NVBIT_TMML; + m_int_uop_table[NVBIT_TXD] = UOP_NVBIT_TXD; + m_int_uop_table[NVBIT_TXQ] = UOP_NVBIT_TXQ; + m_int_uop_table[NVBIT_SUATOM] = UOP_NVBIT_SUATOM; + m_int_uop_table[NVBIT_SULD] = UOP_NVBIT_SULD; + m_int_uop_table[NVBIT_SURED] = UOP_NVBIT_SURED; + m_int_uop_table[NVBIT_SUST] = UOP_NVBIT_SUST; + m_int_uop_table[NVBIT_BMOV] = UOP_NVBIT_BMOV; + m_int_uop_table[NVBIT_BPT] = UOP_NVBIT_BPT; + m_int_uop_table[NVBIT_BRA] = UOP_NVBIT_BRA; + m_int_uop_table[NVBIT_BREAK] = UOP_NVBIT_BREAK; + m_int_uop_table[NVBIT_BRX] = UOP_NVBIT_BRX; + m_int_uop_table[NVBIT_BRXU] = UOP_NVBIT_BRXU; + m_int_uop_table[NVBIT_BSSY] = UOP_NVBIT_BSSY; + m_int_uop_table[NVBIT_BSYNC] = UOP_NVBIT_BSYNC; + m_int_uop_table[NVBIT_CALL] = UOP_NVBIT_CALL; + m_int_uop_table[NVBIT_EXIT] = UOP_NVBIT_EXIT; + m_int_uop_table[NVBIT_JMP] = UOP_NVBIT_JMP; + m_int_uop_table[NVBIT_JMX] = UOP_NVBIT_JMX; + m_int_uop_table[NVBIT_JMXU] = UOP_NVBIT_JMXU; + m_int_uop_table[NVBIT_KILL] = UOP_NVBIT_KILL; + m_int_uop_table[NVBIT_NANOSLEEP] = UOP_NVBIT_NANOSLEEP; + m_int_uop_table[NVBIT_RET] = UOP_NVBIT_RET; + m_int_uop_table[NVBIT_RPCMOV] = UOP_NVBIT_RPCMOV; + m_int_uop_table[NVBIT_RTT] = UOP_NVBIT_RTT; + m_int_uop_table[NVBIT_WARPSYNC] = UOP_NVBIT_WARPSYNC; + m_int_uop_table[NVBIT_YIELD] = UOP_NVBIT_YIELD; + m_int_uop_table[NVBIT_B2R] = UOP_NVBIT_B2R; + m_int_uop_table[NVBIT_BAR] = UOP_NVBIT_BAR; + m_int_uop_table[NVBIT_CS2R] = UOP_NVBIT_CS2R; + m_int_uop_table[NVBIT_DEPBAR] = UOP_NVBIT_DEPBAR; + m_int_uop_table[NVBIT_GETLMEMBASE] = UOP_NVBIT_GETLMEMBASE; + m_int_uop_table[NVBIT_LEPC] = 
UOP_NVBIT_LEPC; + m_int_uop_table[NVBIT_NOP] = UOP_NVBIT_NOP; + m_int_uop_table[NVBIT_PMTRIG] = UOP_NVBIT_PMTRIG; + m_int_uop_table[NVBIT_R2B] = UOP_NVBIT_R2B; + m_int_uop_table[NVBIT_S2R] = UOP_NVBIT_S2R; + m_int_uop_table[NVBIT_SETCTAID] = UOP_NVBIT_SETCTAID; + m_int_uop_table[NVBIT_SETLMEMBASE] = UOP_NVBIT_SETLMEMBASE; + m_int_uop_table[NVBIT_VOTE] = UOP_NVBIT_VOTE; +} + +const char *nvbit_decoder_c::g_tr_reg_names[MAX_TR_REG] = { + "*invalid*", "*none*", "*imm8*", "*imm*", + "*imm32*", "*mem*", "*mem*", "*mem*", + "*off*", "*off*", "*off*", "*modx*", + "rdi", "rsi", "rbp", "rsp", + "rbx", "rdx", "rcx", "rax", + "r8", "r9", "r10", "r11", + "r12", "r13", "r14", "r15", + "cs", "ss", "ds", "es", + "fs", "gs", "rflags", "rip", + "al", "ah", "ax", "cl", + "ch", "cx", "dl", "dh", + "dx", "bl", "bh", "bx", + "bp", "si", "di", "sp", + "flags", "ip", "edi", "dil", + "esi", "sil", "ebp", "bpl", + "esp", "spl", "ebx", "edx", + "ecx", "eax", "eflags", "eip", + "r8b", "r8w", "r8d", "r9b", + "r9w", "r9d", "r10b", "r10w", + "r10d", "r11b", "r11w", "r11d", + "r12b", "r12w", "r12d", "r13b", + "r13w", "r13d", "r14b", "r14w", + "r14d", "r15b", "r15w", "r15d", + "mm0", "mm1", "mm2", "mm3", + "mm4", "mm5", "mm6", "mm7", + "emm0", "emm1", "emm2", "emm3", + "emm4", "emm5", "emm6", "emm7", + "mxt", "xmm0", "xmm1", "xmm2", + "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9", "xmm10", + "xmm11", "xmm12", "xmm13", "xmm14", + "xmm15", "ymm0", "ymm1", "ymm2", + "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7", "ymm8", "ymm9", "ymm10", + "ymm11", "ymm12", "ymm13", "ymm14", + "ymm15", "mxcsr", "mxcsrmask", "orig_rax", + "dr0", "dr1", "dr2", "dr3", + "dr4", "dr5", "dr6", "dr7", + "cr0", "cr1", "cr2", "cr3", + "cr4", "tssr", "ldtr", "tr0", + "tr1", "tr2", "tr3", "tr4", + "tr5", "fpcw", "fpsw", "fptag", + "fpip_off", "fpip_sel", "fpopcode", "fpdp_off", + "fpdp_sel", "fptag_full", "st0", "st1", + "st2", "st3", "st4", "st5", + "st6", "st7", "x87", "r_status_flags", + "rdf", +}; + +const char *nvbit_decoder_c::g_tr_opcode_names[MAX_NVBIT_OPCODE_NAME] = { + "NVBIT_FADD", "NVBIT_FADD32I", "NVBIT_FCHK", + "NVBIT_FFMA32I", "NVBIT_FFMA", "NVBIT_FMNMX", + "NVBIT_FMUL", "NVBIT_FMUL32I", "NVBIT_FSEL", + "NVBIT_FSET", "NVBIT_FSETP", "NVBIT_FSWZADD", + "NVBIT_MUFU", "NVBIT_HADD2", "NVBIT_HADD2_32I", + "NVBIT_HFMA2", "NVBIT_HFMA2_32I", "NVBIT_HMMA", + "NVBIT_HMUL2", "NVBIT_HMUL2_32I", "NVBIT_HSET2", + "NVBIT_HSETP2", "NVBIT_DADD", "NVBIT_DFMA", + "NVBIT_DMUL", "NVBIT_DSETP", "NVBIT_BMMA", + "NVBIT_BMSK", "NVBIT_BREV", "NVBIT_FLO", + "NVBIT_IABS", "NVBIT_IADD", "NVBIT_IADD3", + "NVBIT_IADD32I", "NVBIT_IDP", "NVBIT_IDP4A", + "NVBIT_IMAD", "NVBIT_IMMA", "NVBIT_IMNMX", + "NVBIT_IMUL", "NVBIT_IMUL32I", "NVBIT_ISCADD", + "NVBIT_ISCADD32I", "NVBIT_ISETP", "NVBIT_LEA", + "NVBIT_LOP", "NVBIT_LOP3", "NVBIT_LOP32I", + "NVBIT_POPC", "NVBIT_SHF", "NVBIT_SHL", + "NVBIT_SHR", "NVBIT_VABSDIFF", "NVBIT_VABSDIFF4", + "NVBIT_F2F", "NVBIT_F2I", "NVBIT_I2F", + "NVBIT_I2I", "NVBIT_I2IP", "NVBIT_FRND", + "NVBIT_MOV", "NVBIT_MOV32I", "NVBIT_MOVM", + "NVBIT_PRMT", "NVBIT_SEL", "NVBIT_SGXT", + "NVBIT_SHFL", "NVBIT_PLOP3", "NVBIT_PSETP", + "NVBIT_P2R", "NVBIT_R2P", "NVBIT_LD", + "NVBIT_LDC", "NVBIT_LDG", "NVBIT_LDL", + "NVBIT_LDS", "NVBIT_LDSM", "NVBIT_ST", + "NVBIT_STG", "NVBIT_STL", "NVBIT_STS", + "NVBIT_MATCH", "NVBIT_QSPC", "NVBIT_ATOM", + "NVBIT_ATOMS", "NVBIT_ATOMG", "NVBIT_RED", + "NVBIT_CCTL", "NVBIT_CCTLL", "NVBIT_ERRBAR", + "NVBIT_S2UR", "NVBIT_UBMSK", "NVBIT_UBREV", + "NVBIT_MEMBAR", "NVBIT_CCTLT", "NVBIT_R2UR", + "NVBIT_UCLEA", "NVBIT_UFLO", 
"NVBIT_UIADD3", + "NVBIT_UIADD3_64", "NVBIT_UIMAD", "NVBIT_UISETP", + "NVBIT_ULDC", "NVBIT_ULEA", "NVBIT_ULOP", + "NVBIT_ULOP3", "NVBIT_ULOP32I", "NVBIT_UMOV", + "NVBIT_UP2UR", "NVBIT_UPLOP3", "NVBIT_UPOPC", + "NVBIT_UPRMT", "NVBIT_UPSETP", "NVBIT_UR2UP", + "NVBIT_USEL", "NVBIT_USGXT", "NVBIT_USHF", + "NVBIT_USHL", "NVBIT_USHR", "NVBIT_VOTEU", + "NVBIT_TEX", "NVBIT_TLD", "NVBIT_TLD4", + "NVBIT_TMML", "NVBIT_TXD", "NVBIT_TXQ", + "NVBIT_SUATOM", "NVBIT_SULD", "NVBIT_SURED", + "NVBIT_SUST", "NVBIT_BMOV", "NVBIT_BPT", + "NVBIT_BRA", "NVBIT_BREAK", "NVBIT_BRX", + "NVBIT_BRXU", "NVBIT_BSSY", "NVBIT_BSYNC", + "NVBIT_CALL", "NVBIT_EXIT", "NVBIT_JMP", + "NVBIT_JMX", "NVBIT_JMXU", "NVBIT_KILL", + "NVBIT_NANOSLEEP", "NVBIT_RET", "NVBIT_RPCMOV", + "NVBIT_RTT", "NVBIT_WARPSYNC", "NVBIT_YIELD", + "NVBIT_B2R", "NVBIT_BAR", "NVBIT_CS2R", + "NVBIT_DEPBAR", "NVBIT_GETLMEMBASE", "NVBIT_LEPC", + "NVBIT_NOP", "NVBIT_PMTRIG", "NVBIT_R2B", + "NVBIT_S2R", "NVBIT_SETCTAID", "NVBIT_SETLMEMBASE", + "NVBIT_VOTE"}; + +const char *nvbit_decoder_c::g_tr_cf_names[10] = { + "NOT_CF", // not a control flow instruction + "CF_BR", // an unconditional branch + "CF_CBR", // a conditional branch + "CF_CALL", // a call + "CF_IBR", // an indirect branch + "CF_ICALL", // an indirect call + "CF_ICO", // an indirect jump to co-routine + "CF_RET", // a return + "CF_MITE"}; + +const char *nvbit_decoder_c::g_addr_space_names[MAX_GPU_ADDR_SPACE] = { + "GPU_ADDR_SP_INVALID", "GPU_ADDR_SP_CONST", "GPU_ADDR_SP_GLOBAL", + "GPU_ADDR_SP_LOCAL", "GPU_ADDR_SP_PARAM", "GPU_ADDR_SP_SHARED", + "GPU_ADDR_SP_TEXTURE", "GPU_ADDR_SP_GENERIC"}; + +const char *nvbit_decoder_c::g_cache_op_names[MAX_GPU_CACHE_OP] = { + "GPU_CACHE_OP_INVALID", "GPU_CACHE_OP_CA", "GPU_CACHE_OP_CV", + "GPU_CACHE_OP_CG", "GPU_CACHE_OP_CS", "GPU_CACHE_OP_WB", + "GPU_CACHE_OP_WT"}; + +const char *nvbit_decoder_c::g_cache_level_names[MAX_GPU_CACHE_LEVEL] = { + "GPU_CACHE_INVALID", "GPU_CACHE_L1", "GPU_CACHE_L2"}; + +const char *nvbit_decoder_c::g_fence_level_names[MAX_GPU_FENCE_LEVEL] = { + "GPU_FENCE_INVALID", "GPU_FENCE_CTA", "GPU_FENCE_GL", "GPU_FENCE_SYS"}; + +const char *nvbit_decoder_c::g_optype_names[37] = { + "OP_INV", // invalid opcode + "OP_SPEC", // something weird (rpcc) + "OP_NOP", // is a decoded nop + "OP_CF", // change of flow + "OP_CMOV", // conditional move + "OP_LDA", // load address + "OP_IMEM", // int memory instruction + "OP_IADD", // integer add + "OP_IMUL", // integer multiply + "OP_ICMP", // integer compare + "OP_LOGIC", // logical + "OP_SHIFT", // shift + "OP_BYTE", // byte manipulation + "OP_MM", // multimedia instructions + "OP_FMEM", // fp memory instruction + "OP_FCF", + "OP_FCVT", // floating point convert + "OP_FADD", // floating point add + "OP_FMUL", // floating point multiply + "OP_FDIV", // floating point divide + "OP_FCMP", // floating point compare + "OP_FBIT", // floating point bit + "OP_FCMOV" // floating point cond move +}; + +const char *nvbit_decoder_c::g_mem_type_names[20] = { + "NOT_MEM", // not a memory instruction + "MEM_LD", // a load instruction + "MEM_ST", // a store instruction + "MEM_PF", // a prefetch + "MEM_WH", // a write hint + "MEM_EVICT", // a cache block eviction hint + "MEM_SWPREF_NTA", "MEM_SWPREF_T0", "MEM_SWPREF_T1", "MEM_SWPREF_T2", + "MEM_LD_LM", "MEM_LD_SM", "MEM_LD_GM", "MEM_ST_LM", + "MEM_ST_SM", "MEM_ST_GM", "MEM_LD_CM", "MEM_LD_TM", + "MEM_LD_PM", "NUM_MEM_TYPES"}; diff --git a/src/trace_read_nvbit.h b/src/trace_read_nvbit.h new file mode 100644 index 00000000..dd571b24 --- /dev/null +++ 
b/src/trace_read_nvbit.h
@@ -0,0 +1,158 @@
+/*
+Copyright (c) <2012>, All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted
+provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list of conditions
+and the following disclaimer.
+
+Redistributions in binary form must reproduce the above copyright notice, this list of
+conditions and the following disclaimer in the documentation and/or other materials provided
+with the distribution.
+
+Neither the name of the nor the names of its contributors
+may be used to endorse or promote products derived from this software without specific prior
+written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/**********************************************************************************************
+ * File : trace_read_nvbit.h
+ * Author : HPArch Research Group
+ * Date :
+ * SVN : $Id: dram.h 912 2009-11-20 19:09:21Z kacear $
+ * Description : NVBit trace handling class
+ *********************************************************************************************/
+
+#ifndef TRACE_READ_NVBIT_H_INCLUDED
+#define TRACE_READ_NVBIT_H_INCLUDED
+
+#include "uop.h"
+#include "inst_info.h"
+#include "trace_read.h"
+
+//#define MAX_NVBIT_OPCODE_NAME 163
+#define MAX_NVBIT_OPCODE_NAME NVBIT_OPCODE_LAST
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// \brief Trace reader class
+///
+/// This class handles all trace-related operations: reading instructions from the file,
+/// decoding them, splitting them into micro-ops, setting up uops, and so on.
+///////////////////////////////////////////////////////////////////////////////////////////////
+// class nvbit_decoder_c : public
+class nvbit_decoder_c : public trace_read_c
+{
+public:
+  /**
+   * Constructor
+   */
+  nvbit_decoder_c(macsim_c *simBase, ofstream *dprint_output);
+
+  /**
+   * Destructor
+   */
+  ~nvbit_decoder_c();
+
+  /**
+   * Get an uop from trace
+   * Called by frontend.cc
+   * @param core_id - core id
+   * @param uop - uop object to hold instruction information
+   * @param sim_thread_id - thread id
+   */
+  bool get_uops_from_traces(int core_id, uop_c *uop, int sim_thread_id);
+
+  /**
+   * GPU simulation : read ahead in the trace to obtain synchronization information
+   * @param trace_info - trace information
+   * @see process_manager_c::sim_thread_schedule
+   */
+  void pre_read_trace(thread_s *trace_info);
+
+  static const char *g_tr_reg_names[MAX_TR_REG]; /**< register name string */
+  static const char
+    *g_tr_opcode_names[MAX_NVBIT_OPCODE_NAME]; /**< opcode name string */
+  static const char *g_tr_cf_names[10]; /**< cf type string */
+  static const char *g_optype_names[37]; /**< opcode type string */
+  static const char *g_mem_type_names[20]; /**< memory request type string */
+  static const char
+    *g_addr_space_names[MAX_GPU_ADDR_SPACE]; /**