Skip to content

Commit

Permalink
Serialize cluster into yaml (#499)
Browse files Browse the repository at this point in the history
### Issue

Solving part of #431 

### Description

Add a function to serialize a cluster to YAML (the reverse of what we have
been doing so far). Add a flag to all Cluster constructors to create mock chips
inside the Cluster if the user wants that. Expose this (de)serialization through
the API as mentioned in the issue above. The output YAML looks like this (note
that the styling differs from the original, but in YAML terms it is equivalent):

```
arch:
  1: wormhole_b0
  0: wormhole_b0
ethernet_connections:
  -
    - chip: 1
      chan: 1
    - chip: 0
      chan: 9
  -
    - chip: 1
      chan: 0
    - chip: 0
      chan: 8
chips_with_mmio:
  - 0: 0
harvesting:
  1:
    noc_translation: true
    harvest_mask: 513
  0:
    noc_translation: true
    harvest_mask: 129
boardtype:
  1: n300
  0: n300 
```

- Add serialize function to Cluster
- Add serialize function to the cluster descriptor
- Add flag to all Cluster constructors to create mock chips
  • Loading branch information
pjanevskiTT authored Feb 25, 2025
1 parent 06f2666 commit 472ef2f
Show file tree
Hide file tree
Showing 6 changed files with 235 additions and 107 deletions.
57 changes: 32 additions & 25 deletions device/api/umd/device/cluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#pragma once
#include <cassert>
#include <cstdint>
#include <filesystem>
#include <memory>
#include <set>
#include <stdexcept>
Expand Down Expand Up @@ -689,15 +690,16 @@ class Cluster : public tt_device {
* Simplest form, creates a cluster of all available devices on the system.
*
* @param num_host_mem_ch_per_mmio_device Requested number of host channels (hugepages).
* @param skip_driver_allocs
* @param create_mock_chips Create mock chips for the devices in the cluster descriptor.
* @param clean_system_resource Specifies if host state from previous runs needs to be cleaned up.
* @param perform_harvesting Allow the driver to modify the SOC descriptors per chip.
* @param simulated_harvesting_masks Manually specify additional harvesting masks for the devices in the cluster.
* The ones defined by the devices itself have to be used, they will be merged with the ones passed here.
* @param create_mock_chips Create mock chips for the devices in the cluster descriptor.
*/
Cluster(
const uint32_t& num_host_mem_ch_per_mmio_device = 1,
const bool skip_driver_allocs = false,
const bool create_mock_chips = false,
const bool clean_system_resources = false,
bool perform_harvesting = true,
std::unordered_map<chip_id_t, HarvestingMasks> simulated_harvesting_masks = {});
Expand All @@ -708,7 +710,7 @@ class Cluster : public tt_device {
*
* @param target_devices Devices to target.
* @param num_host_mem_ch_per_mmio_device Requested number of host channels (hugepages).
* @param skip_driver_allocs
* @param create_mock_chips Create mock chips for the devices in the cluster descriptor.
* @param clean_system_resource Specifies if host state from previous runs needs to be cleaned up.
* @param perform_harvesting Allow the driver to modify the SOC descriptors per chip.
* @param simulated_harvesting_masks Manually specify additional harvesting masks for the devices in the cluster.
Expand All @@ -717,7 +719,7 @@ class Cluster : public tt_device {
Cluster(
const std::set<chip_id_t>& target_devices,
const uint32_t& num_host_mem_ch_per_mmio_device = 1,
const bool skip_driver_allocs = false,
const bool create_mock_chips = false,
const bool clean_system_resources = false,
bool perform_harvesting = true,
std::unordered_map<chip_id_t, HarvestingMasks> simulated_harvesting_masks = {});
Expand All @@ -731,7 +733,7 @@ class Cluster : public tt_device {
* harvesting info of the devices in the cluster.
* @param target_devices Devices to target.
* @param num_host_mem_ch_per_mmio_device Requested number of host channels (hugepages).
* @param skip_driver_allocs
* @param create_mock_chips Create mock chips for the devices in the cluster descriptor.
* @param clean_system_resource Specifies if host state from previous runs needs to be cleaned up.
* @param perform_harvesting Allow the driver to modify the SOC descriptors per chip.
* @param simulated_harvesting_masks Manually specify additional harvesting masks for the devices in the cluster.
Expand All @@ -741,38 +743,33 @@ class Cluster : public tt_device {
const std::string& sdesc_path,
const std::set<chip_id_t>& target_devices,
const uint32_t& num_host_mem_ch_per_mmio_device = 1,
const bool skip_driver_allocs = false,
const bool create_mock_chips = false,
const bool clean_system_resources = false,
bool perform_harvesting = true,
std::unordered_map<chip_id_t, HarvestingMasks> simulated_harvesting_masks = {});

/**
* Cluster constructor.
* This constructor offers maximal flexibility, allowing the user to pass manually created Chips.
* The user has to know what they are doing.
* TODO: Could fail if logical_ids not match the ones in cluster descriptor, while Cluster still uses cluster
* descriptor.
* This constructor can be used with custom cluster descriptor. If the cluster descriptor does not match the
* actual devices on the system, the constructor will throw an exception. If create_mock_chips is set to true,
* the constructor will create mock chips for the devices in the cluster descriptor.
*
* @param chips Map of logical device ids to Chip instances.
* @param cluster_descriptor Cluster descriptor object based on which Cluster is going to be created.
* @param num_host_mem_ch_per_mmio_device Requested number of host channels (hugepages).
* @param skip_driver_allocs
* @param create_mock_chips Create mock chips for the devices in the cluster descriptor.
* @param clean_system_resource Specifies if host state from previous runs needs to be cleaned up.
* @param perform_harvesting Allow the driver to modify the SOC descriptors per chip.
* @param simulated_harvesting_masks
* @param simulated_harvesting_masks Manually specify additional harvesting masks for the devices in the cluster.
* The ones defined by the devices itself have to be used, they will be merged with the ones passed here.
*/
Cluster(
std::unordered_map<chip_id_t, std::unique_ptr<Chip>>& chips,
std::unique_ptr<tt_ClusterDescriptor> cluster_descriptor,
const uint32_t& num_host_mem_ch_per_mmio_device = 1,
const bool skip_driver_allocs = false,
const bool create_mock_chips = false,
const bool clean_system_resources = false,
bool perform_harvesting = true,
std::unordered_map<chip_id_t, HarvestingMasks> simulated_harvesting_masks = {});

/**
* Cluster constructor which creates a cluster with Mock chips.
*/
static std::unique_ptr<Cluster> create_mock_cluster();

// Existing API we want to keep. UMD is transitioning to use CoreCoord instead of tt_xy_pair.
// This set of function shouldn't be removed even after the transition.
// TODO: regroup the functions from this set into setup/teardown, runtime, and misc functions.
Expand Down Expand Up @@ -996,6 +993,11 @@ class Cluster : public tt_device {
const chip_id_t chip, const std::unordered_set<tt::umd::CoreCoord>& cores, const std::string& fallback_tlb);

static std::unique_ptr<tt_ClusterDescriptor> create_cluster_descriptor();

static std::string serialize();

static std::filesystem::path serialize_to_file();

// Destructor
virtual ~Cluster();

Expand All @@ -1005,7 +1007,7 @@ class Cluster : public tt_device {
void create_device(
const std::set<chip_id_t>& target_mmio_device_ids,
const uint32_t& num_host_mem_ch_per_mmio_device,
const bool skip_driver_allocs,
const bool create_mock_chips,
const bool clean_system_resources);
void initialize_interprocess_mutexes(int logical_device_id, bool cleanup_mutexes_in_shm);
void cleanup_shared_host_state();
Expand Down Expand Up @@ -1137,18 +1139,23 @@ class Cluster : public tt_device {

// Helper functions for constructing the chips from the cluster descriptor.
std::unique_ptr<Chip> construct_chip_from_cluster(
chip_id_t chip_id, tt_ClusterDescriptor* cluster_desc, tt_SocDescriptor& soc_desc);
chip_id_t chip_id,
tt_ClusterDescriptor* cluster_desc,
tt_SocDescriptor& soc_desc,
const bool create_mock_chip = false);
std::unique_ptr<Chip> construct_chip_from_cluster(
const std::string& soc_desc_path,
chip_id_t chip_id,
tt_ClusterDescriptor* cluster_desc,
bool perform_harvesting,
std::unordered_map<chip_id_t, HarvestingMasks>& simulated_harvesting_masks);
std::unordered_map<chip_id_t, HarvestingMasks>& simulated_harvesting_masks,
const bool create_mock_chip = false);
std::unique_ptr<Chip> construct_chip_from_cluster(
chip_id_t logical_device_id,
tt_ClusterDescriptor* cluster_desc,
bool perform_harvesting,
std::unordered_map<chip_id_t, HarvestingMasks>& simulated_harvesting_masks);
std::unordered_map<chip_id_t, HarvestingMasks>& simulated_harvesting_masks,
const bool create_mock_chip = false);
void add_chip(chip_id_t chip_id, std::unique_ptr<Chip> chip);
HarvestingMasks get_harvesting_masks(
chip_id_t chip_id,
Expand All @@ -1174,7 +1181,7 @@ class Cluster : public tt_device {
std::unordered_map<chip_id_t, HarvestingMasks>& simulated_harvesting_masks);
void construct_cluster(
const uint32_t& num_host_mem_ch_per_mmio_device,
const bool skip_driver_allocs,
const bool create_mock_chips,
const bool clean_system_resources,
bool perform_harvesting,
std::unordered_map<chip_id_t, HarvestingMasks> simulated_harvesting_masks);
Expand Down
4 changes: 4 additions & 0 deletions device/api/umd/device/tt_cluster_descriptor.h
Original file line number Diff line number Diff line change
Expand Up @@ -122,4 +122,8 @@ class tt_ClusterDescriptor {
chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const;

void enable_all_devices();

std::string serialize() const;

std::filesystem::path serialize_to_file() const;
};
28 changes: 28 additions & 0 deletions device/api/umd/device/types/cluster_descriptor_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <cstdint>
#include <functional>

#include "fmt/core.h"
#include "umd/device/types/harvesting.h"

// Small performant hash combiner taken from boost library.
Expand Down Expand Up @@ -51,6 +52,33 @@ enum BoardType : uint32_t {
UNKNOWN,
};

/**
 * Converts a BoardType enumerator into its lowercase textual name.
 *
 * @param board_type Board type to convert.
 * @return Lowercase name of the board type, e.g. "n300" for BoardType::N300.
 * @throws std::runtime_error if board_type does not match any enumerator handled below.
 */
inline std::string board_type_to_string(const BoardType board_type) {
    // Stays null when the value matches none of the cases, which triggers the throw below.
    const char* name = nullptr;

    switch (board_type) {
        case BoardType::E75:
            name = "e75";
            break;
        case BoardType::E150:
            name = "e150";
            break;
        case BoardType::E300:
            name = "e300";
            break;
        case BoardType::N150:
            name = "n150";
            break;
        case BoardType::N300:
            name = "n300";
            break;
        case BoardType::P100:
            name = "p100";
            break;
        case BoardType::P150:
            name = "p150";
            break;
        case BoardType::P300:
            name = "p300";
            break;
        case BoardType::GALAXY:
            name = "galaxy";
            break;
        case BoardType::UNKNOWN:
            name = "unknown";
            break;
    }

    if (name == nullptr) {
        throw std::runtime_error("Unknown board type passed for conversion to string.");
    }
    return name;
}

// TODO: add Wormhole and Grayskull board types to this function
inline BoardType get_board_type_from_board_id(const uint64_t board_id) {
uint64_t upi = (board_id >> 36) & 0xFFFFF;
Expand Down
Loading

0 comments on commit 472ef2f

Please sign in to comment.