-
Notifications
You must be signed in to change notification settings - Fork 20
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* feat: barebones * feat: Implement preflight checks for GPU and Docker runtime * feat: Preflight logging and success messages * chore: restore comments in [pkg/resourceprovider/resourceprovider.go] * feat: gpu check + cleanup * fix: gpu nvidia-smi check * fix: handle no-GPU case gracefully * fix: update start method in RP * chore: remove unused dockerfiles * chore: restore comments in [pkg/resourceprovider/resourceprovider.go] * chore: remove comments within [pkg/resourceprovider/resourceprovider.go] * refactor: removed minimum GPU parameter * feat: 1gb ram requirement * refactor: simplify GPU configuration by removing unnecessary parameters * refactor: enhance GPU info logging and remove types file * refactor: replace hardcoded GPU memory with default constant * refactor: improve GPU info parsing and validation in GetGPUInfo * chore: comments for required GPU VRAM * refactor: Move preflight checker from interface to struct Co-authored-by: logan <[email protected]> * chore: Remove preflight check from start function Co-authored-by: logan <[email protected]> * refactor: Move RunPreflightChecks function to preflight package Co-authored-by: logan <[email protected]> * refactor: Move preflight config to preflight package Co-authored-by: logan <[email protected]> * chore: refactor context within resource provider * chore: remove unused gpuInfo field * refactor: Make functions and structs private where possible Co-authored-by: logan <[email protected]> * chore: Exit early when no GPU detected Co-authored-by: logan <[email protected]> * chore: Improve failed to parse GPU string error Co-authored-by: logan <[email protected]> --------- Co-authored-by: Brian Ginsburg <[email protected]>
- Loading branch information
Showing
4 changed files
with
303 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
package preflight | ||
|
||
import ( | ||
"context" | ||
"encoding/json" | ||
"fmt" | ||
"os/exec" | ||
) | ||
|
||
// dockerInfo is the slice of `docker info` JSON output that the Docker
// preflight check decodes. Only the runtime table is kept.
type dockerInfo struct {
	// Runtimes is keyed by container runtime name (for example "nvidia").
	// The check only tests for key presence; the values are left undecoded.
	Runtimes map[string]interface{} `json:"Runtimes"`
}
|
||
func (p *preflightChecker) checkDockerRuntime(ctx context.Context) checkResult { | ||
cmd := exec.CommandContext(ctx, "docker", "info", "--format", "{{json .}}") | ||
output, err := cmd.Output() | ||
if err != nil { | ||
return checkResult{ | ||
passed: false, | ||
error: fmt.Errorf("failed to get Docker info: %w", err), | ||
message: "Docker check failed", | ||
} | ||
} | ||
|
||
var info dockerInfo | ||
if err := json.Unmarshal(output, &info); err != nil { | ||
return checkResult{ | ||
passed: false, | ||
error: fmt.Errorf("failed to parse Docker info: %w", err), | ||
message: "Docker info parsing failed", | ||
} | ||
} | ||
|
||
// Check for nvidia runtime | ||
_, hasNvidia := info.Runtimes["nvidia"] | ||
if !hasNvidia { | ||
return checkResult{ | ||
passed: false, | ||
error: fmt.Errorf("nvidia runtime not found in Docker configuration"), | ||
message: "NVIDIA runtime not found in Docker", | ||
} | ||
} | ||
|
||
// Test nvidia runtime | ||
testCmd := exec.CommandContext(ctx, "docker", "run", "--rm", "--runtime=nvidia", "nvidia/cuda:11.8.0-base", "nvidia-smi") | ||
if err := testCmd.Run(); err != nil { | ||
return checkResult{ | ||
passed: false, | ||
error: fmt.Errorf("failed to run NVIDIA runtime test: %w", err), | ||
message: "NVIDIA runtime test failed", | ||
} | ||
} | ||
|
||
return checkResult{ | ||
passed: true, | ||
message: "NVIDIA runtime is available and functional", | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
package preflight | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"os/exec" | ||
"strconv" | ||
"strings" | ||
|
||
"github.com/rs/zerolog/log" | ||
) | ||
|
||
// gpuCheckConfig carries the parameters handed to checkGPU.
type gpuCheckConfig struct {
	// required selects the strict path in checkGPU: when false, a missing GPU
	// is tolerated and the check still passes.
	required bool
	// minGPUs is presumably a minimum device count; the checks visible in
	// this file do not consult it. TODO confirm intended use.
	minGPUs int
	// minMemory is a GPU memory threshold; runAllChecks fills it in bytes
	// (gigabytes * 1024^3).
	minMemory int64
	// capabilities is not consulted by the checks visible in this file.
	capabilities []string
}
|
||
// checkNvidiaSMI reports whether an nvidia-smi executable can be resolved with
// exec.LookPath. It returns nil when the binary is found and LookPath's error
// unchanged otherwise.
func checkNvidiaSMI() error {
	if _, lookupErr := exec.LookPath("nvidia-smi"); lookupErr != nil {
		return lookupErr
	}
	return nil
}
|
||
// nvidiaSmiResponse mirrors the four columns requested from nvidia-smi
// (UUID, name, total memory, driver version) as raw strings.
//
// NOTE(review): nothing in the visible code references this type; confirm
// whether it is still needed.
type nvidiaSmiResponse struct {
	uuid          string
	name          string
	memoryTotal   string
	driverVersion string
}
|
||
func parseGPURecord(record string) (*gpuInfo, error) { | ||
fields := strings.Split(record, ", ") | ||
if len(fields) != 4 { | ||
return nil, fmt.Errorf("invalid record format: expected 4 fields, got %d", len(fields)) | ||
} | ||
|
||
// Parse memory, handling potential empty fields | ||
memoryParts := strings.Split(strings.TrimSpace(fields[2]), " ") | ||
if len(memoryParts) != 2 { | ||
return nil, fmt.Errorf("invalid memory format: %s", fields[2]) | ||
} | ||
|
||
memoryStr := memoryParts[0] | ||
if memoryStr == "" { | ||
return nil, fmt.Errorf("empty memory value") | ||
} | ||
|
||
memoryMiB, err := strconv.ParseInt(memoryStr, 10, 64) | ||
if err != nil { | ||
return nil, fmt.Errorf("failed to parse memory value '%s': %w", memoryStr, err) | ||
} | ||
|
||
// Create GPU info with trimmed fields and validated memory | ||
gpu := &gpuInfo{ | ||
uuid: strings.TrimSpace(fields[0]), | ||
name: strings.TrimSpace(fields[1]), | ||
memoryTotal: memoryMiB, | ||
driverVersion: strings.TrimSpace(fields[3]), | ||
} | ||
|
||
// Validate required fields | ||
if gpu.uuid == "" { | ||
return nil, fmt.Errorf("empty UUID") | ||
} | ||
if gpu.name == "" { | ||
return nil, fmt.Errorf("empty Name") | ||
} | ||
if gpu.driverVersion == "" { | ||
return nil, fmt.Errorf("empty DriverVersion") | ||
} | ||
|
||
return gpu, nil | ||
} | ||
|
||
func (p *preflightChecker) getGPUInfo(ctx context.Context) ([]gpuInfo, error) { | ||
if err := checkNvidiaSMI(); err != nil { | ||
return nil, fmt.Errorf("nvidia-smi not available: %w", err) | ||
} | ||
|
||
cmd := exec.CommandContext(ctx, "nvidia-smi", | ||
"--query-gpu=gpu_uuid,gpu_name,memory.total,driver_version", | ||
"--format=csv,noheader") | ||
output, err := cmd.CombinedOutput() | ||
if err != nil { | ||
log.Error().Str("output", string(output)).Err(err).Msg("nvidia-smi command failed") | ||
return nil, fmt.Errorf("error running nvidia-smi: %w", err) | ||
} | ||
|
||
records := strings.Split(strings.TrimSpace(string(output)), "\n") | ||
gpus := make([]gpuInfo, 0, len(records)) | ||
|
||
for _, record := range records { | ||
gpu, err := parseGPURecord(record) | ||
if err != nil { | ||
log.Warn().Err(err).Msgf("Failed to parse GPU record: %s", record) | ||
continue | ||
} | ||
|
||
gpus = append(gpus, *gpu) | ||
log.Info(). | ||
Str("name", gpu.name). | ||
Str("uuid", gpu.uuid). | ||
Int64("memory_mb", gpu.memoryTotal). | ||
Msgf("🎮 GPU %d details", len(gpus)) | ||
} | ||
|
||
if len(gpus) == 0 { | ||
return nil, fmt.Errorf("no valid GPUs found in nvidia-smi output") | ||
} | ||
|
||
return gpus, nil | ||
} | ||
|
||
func (p *preflightChecker) checkGPU(ctx context.Context, config *gpuCheckConfig) checkResult { | ||
if !config.required { | ||
// Attempt to retrieve GPU info | ||
gpus, err := p.getGPUInfo(ctx) | ||
if err != nil { | ||
log.Warn().Msg("⚠️ Running without GPU support - Resource Provider will operate in CPU-only mode") | ||
return checkResult{ | ||
passed: true, | ||
message: "Operating in CPU-only mode", | ||
} | ||
} | ||
|
||
// If we found GPUs, log them but still continue | ||
log.Info().Msgf("🎮 Found %d optional GPUs available for use", len(gpus)) | ||
return checkResult{ | ||
passed: true, | ||
message: fmt.Sprintf("Found %d NVIDIA GPUs (optional)", len(gpus)), | ||
} | ||
} | ||
|
||
// Required GPU checks | ||
log.Info().Msg("Starting required GPU checks") | ||
gpus, err := p.getGPUInfo(ctx) | ||
if err != nil { | ||
return checkResult{ | ||
passed: false, | ||
error: err, | ||
message: "Required GPU check failed - no NVIDIA GPUs detected", | ||
} | ||
} | ||
|
||
log.Info().Msg("✅ GPU requirements satisfied") | ||
return checkResult{ | ||
passed: true, | ||
message: fmt.Sprintf("Found %d suitable GPUs", len(gpus)), | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
package preflight | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
|
||
"github.com/rs/zerolog/log" | ||
) | ||
|
||
// RequiredGPUMemoryGB is the amount of VRAM, in GB, required at startup when
// GPU support is enabled.
const RequiredGPUMemoryGB = 1
|
||
// gpuInfo describes a single GPU as reported by nvidia-smi.
type gpuInfo struct {
	// uuid is the device UUID column (gpu_uuid).
	uuid string
	// name is the product name column (gpu_name).
	name string
	// memoryTotal is the numeric part of the memory.total column; the GPU
	// parser names this value in MiB.
	memoryTotal int64
	// driverVersion is the driver_version column.
	driverVersion string
}
|
||
// checkResult is the outcome of one preflight check.
type checkResult struct {
	// passed is true when the check succeeded.
	passed bool
	// message is a short human-readable summary of the outcome.
	message string
	// error holds the underlying cause when the check failed; it is left nil
	// on success.
	error error
}
|
||
// preflightConfig groups the settings consumed by runAllChecks.
type preflightConfig struct {
	// GPU holds the GPU-related thresholds.
	GPU struct {
		// MinMemoryGB is the minimum GPU memory in gigabytes; runAllChecks
		// converts it to bytes before handing it to the GPU check.
		MinMemoryGB int64
	}
	// Docker holds the Docker-related switches.
	Docker struct {
		// CheckRuntime enables the Docker NVIDIA runtime check.
		CheckRuntime bool
	}
}
|
||
type preflightChecker struct { | ||
gpuInfo []gpuInfo | ||
} | ||
|
||
func RunPreflightChecks() error { | ||
ctx := context.Background() | ||
log.Info().Msg("Starting preflight checks...") | ||
checker := &preflightChecker{} | ||
config := preflightConfig{ | ||
GPU: struct { | ||
MinMemoryGB int64 | ||
}{ | ||
MinMemoryGB: RequiredGPUMemoryGB, | ||
}, | ||
} | ||
|
||
// Logging GPU requirements | ||
gpuInfo, err := checker.getGPUInfo(ctx) | ||
if err != nil { | ||
log.Warn().Err(err).Msg("⚠️ No GPU detected - will operate in CPU-only mode") | ||
return nil | ||
} else { | ||
log.Info(). | ||
Int("gpu_count", len(gpuInfo)). | ||
Int64("min_memory_gb", config.GPU.MinMemoryGB). | ||
Msg("🎮 GPU requirements") | ||
} | ||
|
||
err = checker.runAllChecks(ctx, config) | ||
if err != nil { | ||
log.Error().Err(err).Msg("❌ Preflight checks failed") | ||
return err | ||
} | ||
return nil | ||
} | ||
|
||
func (p *preflightChecker) runAllChecks(ctx context.Context, config preflightConfig) error { | ||
gpuResult := p.checkGPU(ctx, &gpuCheckConfig{ | ||
minMemory: config.GPU.MinMemoryGB * 1024 * 1024 * 1024, | ||
}) | ||
if !gpuResult.passed { | ||
return fmt.Errorf("GPU check failed: %s", gpuResult.message) | ||
} | ||
|
||
if config.Docker.CheckRuntime { | ||
runtimeResult := p.checkDockerRuntime(ctx) | ||
if !runtimeResult.passed { | ||
return fmt.Errorf("Docker runtime check failed: %s", runtimeResult.message) | ||
} | ||
} | ||
|
||
return nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters