Skip to content

Commit

Permalink
feat: implement preflight (#502)
Browse files Browse the repository at this point in the history
* feat: barebones

* feat: Implement preflight checks for GPU and Docker runtime

* feat: Preflight logging and success messages

* chore: restore comments in [pkg/resourceprovider/resourceprovider.go]

* feat: gpu check + cleanup

* fix: gpu nvidia-smi check

* fix: handle no-GPU case gracefully

* fix: update start method in RP

* chore: remove unused dockerfiles

* chore: restore comments in [pkg/resourceprovider/resourceprovider.go]

* chore: remove comments within [pkg/resourceprovider/resourceprovider.go]

* refactor: removed minimum GPU parameter

* feat: 1gb ram requirement

* refactor: simplify GPU configuration by removing unnecessary parameters

* refactor: enhance GPU info logging and remove types file

* refactor: replace hardcoded GPU memory with default constant

* refactor: improve GPU info parsing and validation in GetGPUInfo

* chore: comments for required GPU VRAM

* refactor: Move preflight checker from interface to struct

Co-authored-by: logan <[email protected]>

* chore: Remove preflight check from start function

Co-authored-by: logan <[email protected]>

* refactor: Move RunPreflightChecks function to preflight package

Co-authored-by: logan <[email protected]>

* refactor: Move preflight config to preflight package

Co-authored-by: logan <[email protected]>

* chore: refactor context within resource provider

* chore: remove unused gpuInfo field

* refactor: Make functions and structs private where possible

Co-authored-by: logan <[email protected]>

* chore: Exit early when no GPU detected

Co-authored-by: logan <[email protected]>

* chore: Improve failed to parse GPU string error

Co-authored-by: logan <[email protected]>

---------

Co-authored-by: Brian Ginsburg <[email protected]>
  • Loading branch information
noryev and bgins authored Feb 4, 2025
1 parent c310839 commit 9c2ae59
Show file tree
Hide file tree
Showing 4 changed files with 303 additions and 3 deletions.
58 changes: 58 additions & 0 deletions pkg/resourceprovider/preflight/docker.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
package preflight

import (
"context"
"encoding/json"
"fmt"
"os/exec"
)

// dockerInfo is the subset of the `docker info` JSON document that the
// preflight checks decode. Only the "Runtimes" key is mapped; every other key
// in the document is ignored by encoding/json.
type dockerInfo struct {
// Runtimes is keyed by runtime name. checkDockerRuntime only tests whether
// the "nvidia" key is present and never inspects the values.
Runtimes map[string]interface{} `json:"Runtimes"`
}

// checkDockerRuntime verifies that Docker exposes a working NVIDIA runtime.
//
// Steps, in order:
//  1. run `docker info` and decode its JSON output,
//  2. require a runtime named "nvidia" in the reported runtimes,
//  3. start a throwaway CUDA container with that runtime and run nvidia-smi.
//
// The first step that fails yields a checkResult with passed=false and the
// underlying error; when every step succeeds passed=true is returned.
func (p *preflightChecker) checkDockerRuntime(ctx context.Context) checkResult {
	// failure builds the result shared by every error path below.
	failure := func(message string, err error) checkResult {
		return checkResult{passed: false, message: message, error: err}
	}

	raw, err := exec.CommandContext(ctx, "docker", "info", "--format", "{{json .}}").Output()
	if err != nil {
		return failure("Docker check failed", fmt.Errorf("failed to get Docker info: %w", err))
	}

	var info dockerInfo
	if err = json.Unmarshal(raw, &info); err != nil {
		return failure("Docker info parsing failed", fmt.Errorf("failed to parse Docker info: %w", err))
	}

	// Only the presence of the runtime entry matters, not its configuration.
	if _, ok := info.Runtimes["nvidia"]; !ok {
		return failure("NVIDIA runtime not found in Docker", fmt.Errorf("nvidia runtime not found in Docker configuration"))
	}

	// Smoke test: the container's output is discarded, only the exit status counts.
	// NOTE(review): confirm that this image tag is still published; nvidia/cuda
	// tags usually carry an OS suffix (for example "-ubuntu22.04").
	smoke := exec.CommandContext(ctx, "docker", "run", "--rm", "--runtime=nvidia", "nvidia/cuda:11.8.0-base", "nvidia-smi")
	if err = smoke.Run(); err != nil {
		return failure("NVIDIA runtime test failed", fmt.Errorf("failed to run NVIDIA runtime test: %w", err))
	}

	return checkResult{
		passed:  true,
		message: "NVIDIA runtime is available and functional",
	}
}
151 changes: 151 additions & 0 deletions pkg/resourceprovider/preflight/gpu.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
package preflight

import (
"context"
"fmt"
"os/exec"
"strconv"
"strings"

"github.com/rs/zerolog/log"
)

// gpuCheckConfig carries the GPU requirements handed to checkGPU.
type gpuCheckConfig struct {
// required selects the strict path in checkGPU; when false, a failure to
// detect GPUs is tolerated and the check still passes.
required bool
// minGPUs is presumably the minimum number of GPUs expected.
// NOTE(review): confirm where this value is enforced.
minGPUs int
// minMemory is the minimum GPU memory; runAllChecks fills it in bytes
// (gigabytes multiplied by 1024*1024*1024).
minMemory int64
// capabilities is presumably a list of required GPU capabilities.
// NOTE(review): confirm the expected values and where they are checked.
capabilities []string
}

// checkNvidiaSMI reports whether an nvidia-smi executable can be located via
// exec.LookPath. It returns nil when the binary is found and the lookup error
// otherwise; the binary itself is not executed.
func checkNvidiaSMI() error {
	if _, lookupErr := exec.LookPath("nvidia-smi"); lookupErr != nil {
		return lookupErr
	}
	return nil
}

// nvidiaSmiResponse holds four string fields whose names match the columns
// requested from nvidia-smi (uuid, name, total memory, driver version).
// NOTE(review): nothing in the visible code references this type;
// parseGPURecord builds gpuInfo values directly. Confirm whether it is still
// needed.
type nvidiaSmiResponse struct {
uuid string
name string
memoryTotal string
driverVersion string
}

// parseGPURecord converts one CSV line of nvidia-smi output into a gpuInfo.
//
// The line must hold exactly four ", "-separated columns in the order
// uuid, name, total memory, driver version. The memory column must look like
// "<integer> <unit>" with a single space; only the integer part is kept and
// the unit text is not inspected. Surrounding whitespace is trimmed from every
// column. An error is returned for a malformed line, an unparsable memory
// value, or an empty uuid, name or driver version.
func parseGPURecord(record string) (*gpuInfo, error) {
	columns := strings.Split(record, ", ")
	if n := len(columns); n != 4 {
		return nil, fmt.Errorf("invalid record format: expected 4 fields, got %d", n)
	}
	rawUUID, rawName, rawMemory, rawDriver := columns[0], columns[1], columns[2], columns[3]

	// Exactly one space must separate the amount from its unit.
	memoryText := strings.TrimSpace(rawMemory)
	if strings.Count(memoryText, " ") != 1 {
		return nil, fmt.Errorf("invalid memory format: %s", rawMemory)
	}
	amount := memoryText[:strings.IndexByte(memoryText, ' ')]
	if amount == "" {
		return nil, fmt.Errorf("empty memory value")
	}

	total, err := strconv.ParseInt(amount, 10, 64)
	if err != nil {
		return nil, fmt.Errorf("failed to parse memory value '%s': %w", amount, err)
	}

	uuid := strings.TrimSpace(rawUUID)
	name := strings.TrimSpace(rawName)
	driver := strings.TrimSpace(rawDriver)

	// Every descriptive column is mandatory; report the first one that is blank.
	switch {
	case uuid == "":
		return nil, fmt.Errorf("empty UUID")
	case name == "":
		return nil, fmt.Errorf("empty Name")
	case driver == "":
		return nil, fmt.Errorf("empty DriverVersion")
	}

	return &gpuInfo{
		uuid:          uuid,
		name:          name,
		memoryTotal:   total,
		driverVersion: driver,
	}, nil
}

// getGPUInfo queries nvidia-smi for the GPUs on this host and returns every
// record that parses cleanly.
//
// It fails when nvidia-smi cannot be found on PATH, when the command exits
// unsuccessfully, or when no record could be parsed. Records that fail to
// parse are logged at warn level and skipped; each accepted GPU is logged at
// info level.
func (p *preflightChecker) getGPUInfo(ctx context.Context) ([]gpuInfo, error) {
	if err := checkNvidiaSMI(); err != nil {
		return nil, fmt.Errorf("nvidia-smi not available: %w", err)
	}

	cmd := exec.CommandContext(ctx, "nvidia-smi",
		"--query-gpu=gpu_uuid,gpu_name,memory.total,driver_version",
		"--format=csv,noheader")

	// Keep stderr apart from stdout: anything nvidia-smi writes to stderr must
	// not be fed to the CSV parser as if it were a GPU record.
	var stderr strings.Builder
	cmd.Stderr = &stderr
	output, err := cmd.Output()
	if err != nil {
		log.Error().
			Str("output", string(output)).
			Str("stderr", stderr.String()).
			Err(err).
			Msg("nvidia-smi command failed")
		return nil, fmt.Errorf("error running nvidia-smi: %w", err)
	}

	records := strings.Split(strings.TrimSpace(string(output)), "\n")
	gpus := make([]gpuInfo, 0, len(records))

	for _, record := range records {
		// Empty output (or stray blank lines) is not a malformed record.
		if strings.TrimSpace(record) == "" {
			continue
		}

		gpu, err := parseGPURecord(record)
		if err != nil {
			log.Warn().Err(err).Msgf("Failed to parse GPU record: %s", record)
			continue
		}

		gpus = append(gpus, *gpu)
		log.Info().
			Str("name", gpu.name).
			Str("uuid", gpu.uuid).
			Int64("memory_mb", gpu.memoryTotal).
			Msgf("🎮 GPU %d details", len(gpus))
	}

	if len(gpus) == 0 {
		return nil, fmt.Errorf("no valid GPUs found in nvidia-smi output")
	}

	return gpus, nil
}

// checkGPU evaluates the GPU requirements described by config.
//
// When config.required is false the check always passes: a failure to detect
// GPUs is logged and the result reports CPU-only mode, otherwise the number of
// detected GPUs is reported.
//
// When config.required is true, GPU discovery must succeed and at least one
// GPU must offer config.minMemory of memory. minMemory is treated as bytes
// (that is what runAllChecks supplies), while gpuInfo.memoryTotal is the MiB
// figure reported by nvidia-smi. A zero minMemory accepts every detected GPU.
// The minGPUs and capabilities fields of config are not evaluated here.
//
// A nil config is treated as an empty configuration (GPU not required).
func (p *preflightChecker) checkGPU(ctx context.Context, config *gpuCheckConfig) checkResult {
	if config == nil {
		config = &gpuCheckConfig{}
	}

	if !config.required {
		gpus, err := p.getGPUInfo(ctx)
		if err != nil {
			// Not fatal in optional mode, but keep the cause visible in the log.
			log.Warn().Err(err).Msg("⚠️ Running without GPU support - Resource Provider will operate in CPU-only mode")
			return checkResult{
				passed:  true,
				message: "Operating in CPU-only mode",
			}
		}

		log.Info().Msgf("🎮 Found %d optional GPUs available for use", len(gpus))
		return checkResult{
			passed:  true,
			message: fmt.Sprintf("Found %d NVIDIA GPUs (optional)", len(gpus)),
		}
	}

	log.Info().Msg("Starting required GPU checks")
	gpus, err := p.getGPUInfo(ctx)
	if err != nil {
		return checkResult{
			passed:  false,
			error:   err,
			message: "Required GPU check failed - no NVIDIA GPUs detected",
		}
	}

	// Convert the MiB figure to bytes before comparing with the configured minimum.
	const bytesPerMiB = 1024 * 1024
	suitable := 0
	for i := range gpus {
		if gpus[i].memoryTotal*bytesPerMiB < config.minMemory {
			log.Warn().
				Str("name", gpus[i].name).
				Str("uuid", gpus[i].uuid).
				Int64("memory_mb", gpus[i].memoryTotal).
				Int64("min_memory_bytes", config.minMemory).
				Msg("GPU does not meet the minimum memory requirement")
			continue
		}
		suitable++
	}

	if suitable == 0 {
		return checkResult{
			passed:  false,
			error:   fmt.Errorf("no GPU has at least %d bytes of memory", config.minMemory),
			message: "Required GPU check failed - no GPU meets the minimum memory requirement",
		}
	}

	log.Info().Msg("✅ GPU requirements satisfied")
	return checkResult{
		passed:  true,
		message: fmt.Sprintf("Found %d suitable GPUs", suitable),
	}
}
86 changes: 86 additions & 0 deletions pkg/resourceprovider/preflight/preflight.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
package preflight

import (
"context"
"fmt"

"github.com/rs/zerolog/log"
)

// RequiredGPUMemoryGB is the minimum amount of GPU memory, in gigabytes, that
// RunPreflightChecks places in the preflight configuration.
const RequiredGPUMemoryGB = 1 // 1GB of VRAM is required to startup if GPU is enabled

// gpuInfo describes one GPU as parsed from a line of nvidia-smi output.
type gpuInfo struct {
// uuid is the gpu_uuid column, trimmed of surrounding whitespace.
uuid string
// name is the gpu_name column, trimmed of surrounding whitespace.
name string
// memoryTotal is the integer part of the memory.total column; the preflight
// code treats it as MiB (it is logged under the "memory_mb" key).
memoryTotal int64
// driverVersion is the driver_version column, trimmed of surrounding whitespace.
driverVersion string
}

// checkResult is the outcome of a single preflight check.
type checkResult struct {
// passed reports whether the check succeeded.
passed bool
// message is a short human-readable summary of the outcome.
message string
// error holds the underlying failure, if any; the checks in this package
// leave it nil when passed is true.
error error
}

// preflightConfig groups the settings consumed by runAllChecks.
type preflightConfig struct {
// GPU holds the GPU-related settings.
GPU struct {
// MinMemoryGB is the minimum GPU memory in gigabytes; runAllChecks
// multiplies it by 1024*1024*1024 before passing it to checkGPU.
MinMemoryGB int64
}
// Docker holds the Docker-related settings.
Docker struct {
// CheckRuntime enables the Docker NVIDIA runtime check in runAllChecks.
CheckRuntime bool
}
}

// preflightChecker is the receiver for the individual preflight checks
// (GPU discovery, GPU requirements, Docker runtime).
type preflightChecker struct {
// NOTE(review): no code visible in this package reads or writes this field;
// confirm whether it is still needed.
gpuInfo []gpuInfo
}

// RunPreflightChecks probes the host and reports whether startup may proceed.
//
// It first queries the GPUs through nvidia-smi. If that fails the host is
// treated as CPU-only: a warning is logged and nil is returned without running
// any further check. Otherwise the number of detected GPUs and the configured
// minimum GPU memory are logged and the remaining checks run; a failure from
// those checks is logged and returned to the caller.
func RunPreflightChecks() error {
	ctx := context.Background()
	log.Info().Msg("Starting preflight checks...")

	checker := &preflightChecker{}

	// Docker.CheckRuntime keeps its zero value (false), exactly as before.
	var config preflightConfig
	config.GPU.MinMemoryGB = RequiredGPUMemoryGB

	// Named "detected" so the local does not shadow the gpuInfo type.
	detected, err := checker.getGPUInfo(ctx)
	if err != nil {
		log.Warn().Err(err).Msg("⚠️ No GPU detected - will operate in CPU-only mode")
		return nil
	}

	log.Info().
		Int("gpu_count", len(detected)).
		Int64("min_memory_gb", config.GPU.MinMemoryGB).
		Msg("🎮 GPU requirements")

	if err := checker.runAllChecks(ctx, config); err != nil {
		log.Error().Err(err).Msg("❌ Preflight checks failed")
		return err
	}
	return nil
}

// runAllChecks runs the GPU check and, when config.Docker.CheckRuntime is set,
// the Docker NVIDIA runtime check.
//
// It returns nil when every executed check passes. Otherwise it returns an
// error for the first failing check. The error text starts with the check's
// summary message and, when the check recorded an underlying error, wraps that
// error so callers can inspect it with errors.Is / errors.As.
func (p *preflightChecker) runAllChecks(ctx context.Context, config preflightConfig) error {
	// failed turns a non-passing result into an error without dropping its cause.
	failed := func(prefix string, res checkResult) error {
		if res.error != nil {
			return fmt.Errorf("%s: %s: %w", prefix, res.message, res.error)
		}
		return fmt.Errorf("%s: %s", prefix, res.message)
	}

	gpuResult := p.checkGPU(ctx, &gpuCheckConfig{
		// The GPU check expects bytes; the configuration is expressed in gigabytes.
		minMemory: config.GPU.MinMemoryGB * 1024 * 1024 * 1024,
	})
	if !gpuResult.passed {
		return failed("GPU check failed", gpuResult)
	}

	if config.Docker.CheckRuntime {
		runtimeResult := p.checkDockerRuntime(ctx)
		if !runtimeResult.passed {
			return failed("Docker runtime check failed", runtimeResult)
		}
	}

	return nil
}
11 changes: 8 additions & 3 deletions pkg/resourceprovider/resourceprovider.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"github.com/lilypad-tech/lilypad/pkg/executor/bacalhau"
"github.com/lilypad-tech/lilypad/pkg/ipfs"
"github.com/lilypad-tech/lilypad/pkg/powLogs"
"github.com/lilypad-tech/lilypad/pkg/resourceprovider/preflight"
"github.com/lilypad-tech/lilypad/pkg/system"
"github.com/lilypad-tech/lilypad/pkg/web3"
"github.com/lilypad-tech/lilypad/pkg/web3/bindings/pow"
Expand Down Expand Up @@ -59,9 +60,8 @@ type ResourceProviderOfferOptions struct {

// this configures the pow we will keep track of
type ResourceProviderPowOptions struct {
DisablePow bool
NumWorkers int

DisablePow bool
NumWorkers int
CudaGridSize int
CudaBlockSize int
CudaHashsPerThread int
Expand All @@ -88,10 +88,15 @@ func NewResourceProvider(
executor executor.Executor,
tracer trace.Tracer,
) (*ResourceProvider, error) {
if err := preflight.RunPreflightChecks(); err != nil {
return nil, fmt.Errorf("preflight checks failed: %w", err)
}

controller, err := NewResourceProviderController(options, web3SDK, executor, tracer)
if err != nil {
return nil, err
}

solver := &ResourceProvider{
controller: controller,
options: options,
Expand Down

0 comments on commit 9c2ae59

Please sign in to comment.