Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Custom energy monitoring #274

Merged
merged 10 commits into from
Nov 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ zip_safe = True
include_package_data = True
install_requires =
SpiNNUtilities == 1!7.3.1
typing_extensions

[options.packages.find]
include =
Expand Down
9 changes: 9 additions & 0 deletions spinn_machine/machine.py
Original file line number Diff line number Diff line change
Expand Up @@ -833,6 +833,15 @@ def get_fpga_link_with_id(
f" {board_address}")
return self._fpga_links[b_key]

@property
def n_fpga_links(self) -> int:
    """
    How many FPGA links are held by this machine.

    :rtype: int
    """
    fpga_link_count = len(self._fpga_links)
    return fpga_link_count

def add_spinnaker_links(self) -> None:
"""
Add SpiNNaker links that are on a given machine depending on the
Expand Down
53 changes: 51 additions & 2 deletions spinn_machine/version/abstract_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@
from __future__ import annotations
import logging
import re
from typing import (Dict, Iterable, List, Optional, Sequence, Tuple,
TYPE_CHECKING)
from typing import (
Dict, Iterable, List, Optional, Sequence, Tuple, TYPE_CHECKING)
from typing_extensions import TypeAlias

from spinn_utilities.abstract_base import AbstractBase, abstractmethod
from spinn_utilities.log import FormatAdapter
Expand All @@ -28,6 +29,12 @@

logger = FormatAdapter(logging.getLogger(__name__))

# Key used to identify a chip; presumably its (x, y) coordinates - TODO confirm
ChipXY: TypeAlias = Tuple[int, int]
# For each chip, a dict from packet category name to the number of packets
# sent by that chip's router in that category
RouterPackets: TypeAlias = Dict[ChipXY, Dict[str, int]]
# For each chip, a tuple of (the time the cores were active in seconds,
# the number of cores)
ChipActiveTime: TypeAlias = Dict[ChipXY, Tuple[float, int]]

CORE_RANGE = re.compile(r"(\d+)-(\d+)")
CORE_SINGLE = re.compile(r"(-*)(\d+)")

Expand Down Expand Up @@ -528,5 +535,47 @@ def version_parse_cores_string(self, core_string: str) -> Iterable[int]:
"""
raise NotImplementedError

@abstractmethod
def get_idle_energy(
        self, time_s: float, n_frames: int, n_boards: int,
        n_chips: int) -> float:
    """
    Work out the energy, in joules, that the system uses while idle.

    :param float time_s: How long, in seconds, to work the energy out for
    :param int n_frames: How many frames there are
    :param int n_boards: How many boards there are
    :param int n_chips: How many chips there are
    :return: The idle energy in joules
    :rtype: float
    """
    raise NotImplementedError

@abstractmethod
def get_active_energy(
        self, time_s: float, n_frames: int, n_boards: int, n_chips: int,
        chip_active_time: ChipActiveTime,
        router_packets: RouterPackets) -> float:
    """
    Work out the energy, in joules, that the system uses while active.

    :param float time_s: How long, in seconds, to work the energy out for
    :param int n_frames: How many frames there are
    :param int n_boards: How many boards there are
    :param int n_chips: How many chips there are
    :param dict chip_active_time:
        For each chip, the time the cores were active in seconds
    :param dict router_packets:
        For each router, the number of packets it sent
    :return: The active energy in joules
    :rtype: float
    """
    raise NotImplementedError

@abstractmethod
def get_router_report_packet_types(self) -> List[str]:
    """
    List the types of packet that the router is able to send.

    :return: The packet types
    :rtype: list(str)
    """
    raise NotImplementedError

def __hash__(self):
    """
    Hash a version using its ``number``.
    """
    version_number = self.number
    return version_number
31 changes: 31 additions & 0 deletions spinn_machine/version/version_3.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from spinn_machine.full_wrap_machine import FullWrapMachine
from spinn_machine.machine import Machine
from .version_spin1 import VersionSpin1
from .abstract_version import ChipActiveTime, RouterPackets

CHIPS_PER_BOARD: Final = {(0, 0): 18, (0, 1): 18, (1, 0): 18, (1, 1): 18}

Expand All @@ -31,6 +32,10 @@ class Version3(VersionSpin1):
"""
__slots__ = ()

#: Power in watts used by a whole idle 4-chip board.
#: From measuring the power of an idle 4-chip board for 1 hour, the cost
#: is 3.56W
WATTS_FOR_4_CHIP_BOARD_IDLE_COST: Final = 3.56

@property
@overrides(VersionSpin1.name)
def name(self) -> str:
Expand Down Expand Up @@ -86,5 +91,31 @@ def spinnaker_links(self) -> List[Tuple[int, int, int]]:
def fpga_links(self) -> List[Tuple[int, int, int, int, int]]:
return []

@overrides(VersionSpin1.get_idle_energy)
def get_idle_energy(
        self, time_s: float, n_frames: int, n_boards: int,
        n_chips: int) -> float:
    # Refuse machine shapes that a version 3 board cannot have; the frame
    # check comes first so it wins when both are wrong
    if n_frames != 0:
        raise SpinnMachineException(
            "A version 3 SpiNNaker 1 board has no frames!")
    if n_boards > 1:
        raise SpinnMachineException(
            "A version 3 SpiNNaker 1 board has exactly one board!")

    # n_boards is allowed to be 0 to discount the cost of the board, in
    # which case only the chips are charged for
    if n_boards == 0:
        idle_watts = n_chips * self.WATTS_PER_IDLE_CHIP
    else:
        idle_watts = self.WATTS_FOR_4_CHIP_BOARD_IDLE_COST
    return idle_watts * time_s

@overrides(VersionSpin1.get_active_energy)
def get_active_energy(
        self, time_s: float, n_frames: int, n_boards: int, n_chips: int,
        chip_active_time: ChipActiveTime,
        router_packets: RouterPackets) -> float:
    # Active energy is the idle energy plus the energy the routers and the
    # cores used on top of it
    idle_joules = self.get_idle_energy(time_s, n_frames, n_boards, n_chips)
    router_joules = self._get_router_active_energy(router_packets)
    core_joules = self._get_core_active_energy(chip_active_time)
    return idle_joules + router_joules + core_joules

def __eq__(self, other):
    """
    Compare as equal to any object that is an instance of Version3.
    """
    same_version = isinstance(other, Version3)
    return same_version
61 changes: 61 additions & 0 deletions spinn_machine/version/version_5.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

from .version_48_chips import Version48Chips
from .version_spin1 import VersionSpin1
from .abstract_version import ChipActiveTime, RouterPackets

CHIPS_PER_BOARD: Final = {
(0, 0): 18, (0, 1): 18, (0, 2): 18, (0, 3): 18, (1, 0): 18, (1, 1): 17,
Expand All @@ -39,6 +40,29 @@ class Version5(VersionSpin1, Version48Chips):
"""
__slots__ = ()

#: Measured using a real power meter, with the timing taken between the
#: photos, over a day (presumably with the boards powered off -
#: TODO confirm).
#: This is the cost in watts of just a frame itself, including the switch
#: and cooling, while the system is idle.
WATTS_FOR_FRAME_IDLE_COST: Final = 117

#: Measured from the loading of the column and extrapolated.
#: This is the cost in watts of just a frame itself, including the switch
#: and cooling, while the system is active; it is stored as the amount over
#: the 117W idle cost for simplicity
WATTS_PER_FRAME_ACTIVE_COST_OVERHEAD: Final = 154.163558 - 117

# pylint: disable=invalid-name
#: From "Optimising the overall power usage on the SpiNNaker neuromimetic
#: platform", an idle board uses 26.84W, and from measurement of a boxed
#: board with all cores idle for 1 hour, including the power supply and all
#: parts, it uses 31.88W, so the overhead of the box is
#: 31.88 - 26.84 = 5.04W
WATTS_FOR_48_CHIP_BOX_COST_OVERHEAD: Final = 5.04

# pylint: disable=invalid-name
#: From "Optimising the overall power usage on the SpiNNaker neuromimetic
#: platform", an idle board uses 26.84W
WATTS_FOR_48_CHIP_BOARD_IDLE_COST: Final = 26.84

@property
@overrides(VersionSpin1.name)
def name(self) -> str:
Expand Down Expand Up @@ -83,5 +107,42 @@ def fpga_links(self) -> List[Tuple[int, int, int, int, int]]:
(7, 6, 0, 2, 10), (7, 6, 1, 2, 9),
(7, 7, 0, 2, 8), (7, 7, 1, 2, 7), (7, 7, 2, 2, 6)]

@overrides(VersionSpin1.get_idle_energy)
def get_idle_energy(
        self, time_s: float, n_frames: int, n_boards: int,
        n_chips: int) -> float:
    # n_boards is allowed to be 0 to discount the cost of the boards, in
    # which case only the chips are charged for
    if n_boards == 0:
        idle_joules = n_chips * self.WATTS_PER_IDLE_CHIP * time_s
    else:
        idle_joules = (
            n_boards * self.WATTS_FOR_48_CHIP_BOARD_IDLE_COST * time_s)

    # Without a container there is nothing more to add
    if n_frames == 0:
        return idle_joules

    # The container holding the boards: several boards are charged at the
    # frame cost, and exactly one board at the box overhead
    if n_boards > 1:
        return idle_joules + (
            n_frames * self.WATTS_FOR_FRAME_IDLE_COST * time_s)
    if n_boards == 1:
        return idle_joules + (
            n_frames * self.WATTS_FOR_48_CHIP_BOX_COST_OVERHEAD * time_s)
    return idle_joules

@overrides(VersionSpin1.get_active_energy)
def get_active_energy(
        self, time_s: float, n_frames: int, n_boards: int, n_chips: int,
        chip_active_time: ChipActiveTime,
        router_packets: RouterPackets) -> float:
    # The extra cost of the frames being active is only charged when there
    # are frames
    frame_joules = (
        n_frames * self.WATTS_PER_FRAME_ACTIVE_COST_OVERHEAD * time_s
        if n_frames != 0 else 0.0)
    idle_joules = self.get_idle_energy(time_s, n_frames, n_boards, n_chips)
    router_joules = self._get_router_active_energy(router_packets)
    core_joules = self._get_core_active_energy(chip_active_time)
    return frame_joules + idle_joules + router_joules + core_joules

def __eq__(self, other):
    """
    Compare as equal to any object that is an instance of Version5.
    """
    same_version = isinstance(other, Version5)
    return same_version
60 changes: 58 additions & 2 deletions spinn_machine/version/version_spin1.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Iterable, Tuple
from typing import List, Iterable, Tuple, Final
from spinn_utilities.abstract_base import AbstractBase
from spinn_utilities.exceptions import ConfigException
from spinn_utilities.overrides import overrides

from spinn_machine.exceptions import SpinnMachineException
from .abstract_version import AbstractVersion
from .abstract_version import (
AbstractVersion, RouterPackets, ChipActiveTime)


class VersionSpin1(AbstractVersion, metaclass=AbstractBase):
Expand All @@ -27,6 +28,42 @@ class VersionSpin1(AbstractVersion, metaclass=AbstractBase):
Shared code for all Spin1 board versions
"""

#: Power in watts used by each idle chip.
#: From "Optimising the overall power usage on the SpiNNaker neuromimetic
#: platform" - all chips on a 48-chip board together use
#: 5.23 + 5.17 + 5.52W + SDRAM of 0.90W = 16.82W when idle, so each chip
#: uses 16.82 / 48 = 0.35W
WATTS_PER_IDLE_CHIP: Final = 0.35

#: Extra power in watts used by each active core.
#: From measuring the power of all 48 chips on a boxed board with all cores
#: idle for 1 hour and 806 cores active for 1 hour we get 31.88W idle and
#: 59.38W active, so 27.50W active overhead, which is
#: 27.50 / 806 = 0.034W per core
WATTS_PER_CORE_ACTIVE_OVERHEAD: Final = 0.034

#: Energy in joules used by the router for each bit.
#: Stated in papers (SpiNNaker: A 1-W 18 core system-on-Chip for
#: Massively-Parallel Neural Network Simulation)
#: 25pJ per bit
JOULES_PER_ROUTER_BIT: Final = 25e-12

#: Energy in joules used for each packet without a payload.
#: Stated in papers (SpiNNaker: A 1-W 18 core system-on-Chip for
#: Massively-Parallel Neural Network Simulation)
#: 25pJ per bit - spike packets are 40 bits so 1nJ per spike
JOULES_PER_PACKET: Final = JOULES_PER_ROUTER_BIT * 40

#: As above, but with an extra 32 bits of payload, so 72 bits in total
JOULES_PER_PACKET_WITH_PAYLOAD: Final = JOULES_PER_ROUTER_BIT * 72

#: Cost in joules of each packet type, keyed by the packet type name
COST_PER_PACKET_TYPE: Final = {
    "Local_Multicast_Packets": JOULES_PER_PACKET,
    "External_Multicast_Packets": JOULES_PER_PACKET,
    "Reinjected": JOULES_PER_PACKET,
    "Local_P2P_Packets": JOULES_PER_PACKET_WITH_PAYLOAD,
    "External_P2P_Packets": JOULES_PER_PACKET_WITH_PAYLOAD,
    "Local_NN_Packets": JOULES_PER_PACKET,
    "External_NN_Packets": JOULES_PER_PACKET,
    "Local_FR_Packets": JOULES_PER_PACKET_WITH_PAYLOAD,
    "External_FR_Packets": JOULES_PER_PACKET_WITH_PAYLOAD,
}

__slots__ = ()

def __init__(self) -> None:
Expand Down Expand Up @@ -73,3 +110,22 @@ def id_to_qx_qy_qp(self, core_id: int) -> Tuple[int, int, int]:
def version_parse_cores_string(self, core_string: str) -> Iterable[int]:
raise ConfigException(
f"{core_string} does not represent cores for Version 1 boards")

@overrides(AbstractVersion.get_router_report_packet_types)
def get_router_report_packet_types(self) -> List[str]:
    # The packet types reported are exactly the ones that have a cost
    packet_types = [*self.COST_PER_PACKET_TYPE]
    return packet_types

def _get_router_active_energy(
        self, router_packets: RouterPackets) -> float:
    """
    Total up the energy used by the routers to send the packets counted.

    :param dict router_packets:
        For each router, the number of packets sent of each packet type;
        every type must be a key of COST_PER_PACKET_TYPE
    :return: The sum of each count times the cost of its packet type
    :rtype: float
    """
    def _packet_energies():
        # One value for every (router, packet type) pair
        for chip_packets in router_packets.values():
            for packet_type, n_packets in chip_packets.items():
                yield n_packets * self.COST_PER_PACKET_TYPE[packet_type]

    return sum(_packet_energies())

def _get_core_active_energy(
        self, core_active_times: ChipActiveTime) -> float:
    """
    Total up the extra energy used by the cores while they were active.

    :param dict core_active_times:
        For each chip, a tuple of the time the cores were active and the
        number of cores; the number of cores is not used
    :return:
        The sum of each active time times WATTS_PER_CORE_ACTIVE_OVERHEAD
    :rtype: float
    """
    # TODO: treat cores that are active sometimes differently to cores that
    # are always idle
    chip_energies = (
        active_seconds * self.WATTS_PER_CORE_ACTIVE_OVERHEAD
        for active_seconds, _n_cores in core_active_times.values())
    return sum(chip_energies)
23 changes: 22 additions & 1 deletion spinn_machine/version/version_spin2.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
from spinn_utilities.exceptions import ConfigException
from spinn_utilities.overrides import overrides

from .abstract_version import AbstractVersion
from .abstract_version import (
AbstractVersion, ChipActiveTime, RouterPackets)

CHIPS_PER_BOARD: Final = {(0, 0): 152}
CORE_QX_QY_QP = re.compile(r"(\d)\.(\d)\.(\d)")
Expand Down Expand Up @@ -126,3 +127,23 @@ def version_parse_cores_string(self, core_string: str) -> Iterable[int]:

raise ConfigException(
f"{core_string} does not represent cores for Version 2 boards")

@overrides(AbstractVersion.get_idle_energy)
def get_idle_energy(
        self, time_s: float, n_frames: int, n_boards: int,
        n_chips: int) -> float:
    # TODO: The idle energy has still to be worked out for SpiNNaker 2, so
    # until then this always raises
    raise NotImplementedError

@overrides(AbstractVersion.get_active_energy)
def get_active_energy(
        self, time_s: float, n_frames: int, n_boards: int, n_chips: int,
        chip_active_time: ChipActiveTime,
        router_packets: RouterPackets) -> float:
    # TODO: The active energy has still to be worked out for SpiNNaker 2,
    # so until then this always raises
    raise NotImplementedError

@overrides(AbstractVersion.get_router_report_packet_types)
def get_router_report_packet_types(self) -> List[str]:
    # TODO: The packet types have still to be worked out for SpiNNaker 2,
    # so until then this always raises
    raise NotImplementedError