From 45377adb8e2a59d1f95b3f2683e1c210ae8fd40f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Te=C3=AFlo=20M?=
Date: Sun, 22 Dec 2024 09:53:56 +0100
Subject: [PATCH] fix: resolve some of the errors reported by the linter.

---
 .github/workflows/go-ci.yml   |   4 +-
 README.md                     | 326 ++++++++--------------------
 cmd/hapax/main.go             |   7 +-
 config.example.yaml           |   6 +-
 errors/errors.go              |  13 +-
 server/middleware/timeout.go  |   3 +
 server/routing/router.go      |  39 +++-
 server/routing/router_test.go |   8 +-
 server/server.go              |  49 +++--
 tests/circuitbreaker_test.go  |  19 +-
 tests/docker_test.go          |  26 ++-
 11 files changed, 208 insertions(+), 292 deletions(-)

diff --git a/.github/workflows/go-ci.yml b/.github/workflows/go-ci.yml
index 73d2907..157a86e 100644
--- a/.github/workflows/go-ci.yml
+++ b/.github/workflows/go-ci.yml
@@ -39,11 +39,9 @@ jobs:
         with:
           version: latest
           args: |
-            --timeout=5m
-            --disable-all
+            --timeout=5m
             --out-format=colored-line-number
             --issues-exit-code=1
-          # Only show new issues for pull requests
           only-new-issues: true
           skip-pkg-cache: true
           skip-build-cache: false
diff --git a/README.md b/README.md
index 3bdaed9..b601042 100644
--- a/README.md
+++ b/README.md
@@ -1,290 +1,106 @@
 # Hapax
-A lightweight HTTP server for Large Language Model (LLM) interactions, built with Go.
-
-## Version
-v0.0.16
-
-## Features
-
-- HTTP server with completion endpoint (`/v1/completions`)
-- Health check endpoint (`/health`)
-- Configurable server settings (port, timeouts, etc.)
-- Clean shutdown handling
-- Comprehensive test suite with mock LLM implementation
-- Token validation with tiktoken
-  - Automatic token counting
-  - Context length validation
-  - Max tokens validation
-- Middleware architecture:
-  - Request ID tracking
-  - Request timing metrics
-  - Panic recovery
-  - CORS support
-  - API key authentication
-  - Rate limiting (token bucket)
-  - Prometheus metrics collection
-- Enhanced error handling:
-  - Structured JSON error responses
-  - Request ID tracking in errors
-  - Zap-based logging with context
-  - Custom error types for different scenarios
-  - Seamless error middleware integration
-- Dynamic routing:
-  - Version-based routing (v1, v2)
-  - Route-specific middleware
-  - Health check endpoints
-  - Header validation
-- Provider management:
-  - Multiple provider support (OpenAI, Anthropic, etc.)
-  - Provider health monitoring
-  - Automatic failover to backup providers
-  - Configurable health check intervals
-  - Provider-specific configuration
-
-## Installation
+## Large Language Model Infrastructure, Simplified
 
-```bash
-go get github.com/teilomillet/hapax
-```
+Building with Large Language Models is complex. Multiple providers, varying APIs, inconsistent performance, unpredictable costs: these challenges consume more engineering time than the innovation itself.
 
-## Configuration
+Hapax offers a different approach.
 
-Hapax uses YAML for configuration. Here's an example configuration file:
+What if managing LLM infrastructure were as simple as editing a configuration file? What if switching providers, adding endpoints, or implementing fallback strategies could be done with minimal effort?
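+
+As a taste, here is the provider block from the bundled config.example.yaml (updated by this same change; the keys and model names below are exactly those set there):
+
+```yaml
+providers:
+  openai:
+    type: openai
+    model: gpt-4o-mini
+    api_key: ${OPENAI_API_KEY}
+  anthropic:
+    type: anthropic
+    model: claude-3.5-haiku-latest
+    api_key: ${ANTHROPIC_API_KEY}
+```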
-```yaml
-server:
-  port: 8080
-  read_timeout: 30s
-  write_timeout: 30s
-  max_header_bytes: 1048576 # 1MB
-  shutdown_timeout: 30s
+Imagine a system that:
+- Connects to multiple LLM providers seamlessly
+- Provides automatic failover between providers
+- Offers comprehensive monitoring and metrics
+- Allows instant configuration updates without downtime
-routes:
-  - path: "/completions"
-    handler: "completion"
-    version: "v1"
-    methods: ["POST"]
-    middleware: ["auth", "ratelimit"]
-    headers:
-      Content-Type: "application/json"
-    health_check:
-      enabled: true
-      interval: 30s
-      timeout: 5s
-      threshold: 3
-      checks:
-        api: "http"
-
-  - path: "/health"
-    handler: "health"
-    version: "v1"
-    methods: ["GET"]
-    health_check:
-      enabled: true
-      interval: 15s
-      timeout: 2s
-      threshold: 2
-      checks:
-        system: "tcp"
-
-llm:
-  provider: ollama
-  model: llama2
-  endpoint: http://localhost:11434
-  system_prompt: "You are a helpful assistant."
-  max_context_tokens: 4096 # Maximum context length for your model
-  options:
-    temperature: 0.7
-    max_tokens: 2000
-
-logging:
-  level: info # debug, info, warn, error
-  format: json # json, text
-```
-
-### Configuration Options
-
-#### Server Configuration
-- `port`: HTTP server port (default: 8080)
-- `read_timeout`: Maximum duration for reading request body (default: 30s)
-- `write_timeout`: Maximum duration for writing response (default: 30s)
-- `max_header_bytes`: Maximum size of request headers (default: 1MB)
-- `shutdown_timeout`: Maximum duration to wait for graceful shutdown (default: 30s)
-
-#### LLM Configuration
-- `provider`: LLM provider name (e.g., "ollama", "openai")
-- `model`: Model name (e.g., "llama2", "gpt-4")
-- `endpoint`: API endpoint URL
-- `system_prompt`: Default system prompt for conversations
-- `max_context_tokens`: Maximum context length in tokens (model-dependent)
-- `options`: Provider-specific options
-  - `temperature`: Sampling temperature (0.0 to 1.0)
-  - `max_tokens`: Maximum tokens to generate
-
-#### Logging Configuration
-- `level`: Log level (debug, info, warn, error)
-- `format`: Log format (json, text)
-
-## Quick Start
-
-```go
-package main
-
-import (
-	"context"
-	"log"
-
-	"github.com/teilomillet/hapax"
-	"github.com/teilomillet/gollm"
-	"go.uber.org/zap"
-)
-
-func main() {
-	// Initialize logger (optional, defaults to production config)
-	logger, _ := zap.NewProduction()
-	defer logger.Sync()
-	hapax.SetLogger(logger)
-
-	// Create an LLM instance (using gollm)
-	llm := gollm.New()
-
-	// Create a completion handler
-	handler := hapax.NewCompletionHandler(llm)
-
-	// Create a router
-	router := hapax.NewRouter(handler)
-
-	// Use default configuration
-	config := hapax.DefaultConfig()
-
-	// Create and start server
-	server := hapax.NewServer(config, router)
-	if err := server.Start(context.Background()); err != nil {
-		log.Fatal(err)
-	}
-}
-```
+
+This is Hapax.
 
-## API Endpoints
+### Real-World Flexibility in Action
 
-### POST /v1/completions
+Imagine you're running a production service using OpenAI's GPT model. Suddenly, you want to:
+- Add a new Anthropic Claude model endpoint
+- Create a fallback strategy
+- Implement detailed monitoring
 
-Generate completions using the configured LLM.
+With Hapax, this becomes simple:
 
-**Request:**
-```json
-{
-  "prompt": "Your prompt here"
-}
-```
-
-**Response:**
-```json
-{
-  "completion": "LLM generated response"
-}
+```yaml
+# Simply append to your existing configuration
+providers:
+  anthropic:
+    type: anthropic
+    models:
+      claude-3.5-haiku:
+        api_key: ${ANTHROPIC_API_KEY}
+        endpoint: /v1/anthropic/haiku
 ```
 
-**Error Responses:**
-- 400 Bad Request: Invalid JSON or missing prompt
-- 405 Method Not Allowed: Wrong HTTP method
-- 500 Internal Server Error: LLM error
-
-### GET /health
-
-Check server health status.
-
-**Response:**
-```json
-{
-  "status": "ok"
-}
-```
+No downtime. No complex redeployment. Just configuration.
 
-## Error Handling
+## Intelligent Provider Management
 
-Hapax provides structured error handling with JSON responses:
+Hapax goes beyond simple API routing. It creates a resilient ecosystem for your LLM interactions:
 
-```json
-{
-  "type": "validation_error",
-  "message": "Invalid request format",
-  "request_id": "req_123abc",
-  "details": {
-    "field": "prompt",
-    "error": "required"
-  }
-}
-```
+**Automatic Failover**: When one provider experiences issues, Hapax seamlessly switches to backup providers. Your service continues operating without interruption.
 
-Error types include:
-- `validation_error`: Request validation failures
-- `provider_error`: LLM provider issues
-- `rate_limit_error`: Rate limiting
-- `internal_error`: Unexpected server errors
+**Deduplication**: Prevent duplicate requests and unnecessary API calls. Hapax intelligently manages request caching and prevents redundant processing.
 
-## Docker Support
+**Provider Health Monitoring**: Continuously track provider performance. Automatically reconnect to primary providers once they're back online, ensuring optimal resource utilization.
 
-The application comes with full Docker support, making it easy to deploy and run in containerized environments.
+## Comprehensive Observability
 
-### Features
+Hapax isn't just a gateway: it's a complete monitoring and alerting system for your LLM infrastructure:
+- Detailed Prometheus metrics
+- Real-time performance tracking
+- Comprehensive error reporting
+- Intelligent alerting mechanisms
 
-- **Multi-stage Build**: Optimized container size with separate build and runtime stages
-- **Security**: Runs as non-root user with minimal runtime dependencies
-- **Health Checks**: Built-in health monitoring for container orchestration
-- **Prometheus Integration**: Ready-to-use metrics endpoint for monitoring
-- **Docker Compose**: Complete setup with Prometheus integration
+## API Versioning for Scalability
 
-### Running with Docker
+Create multiple API versions effortlessly. Each endpoint can have its own configuration, allowing granular control and smooth evolutionary paths for your services.
 
-1. Build and run using Docker:
-```bash
-docker build -t hapax .
-docker run -p 8080:8080 hapax
+```yaml
+routes:
+  - path: /v1/completions
+    handler: completion
+    version: v1
+  - path: /v2/completions
+    handler: advanced_completion
+    version: v2
 ```
-2. Or use Docker Compose for the full stack with Prometheus:
+## Getting Started
+
 ```bash
-docker compose up -d
+# Pull Hapax
+docker pull ghcr.io/teilomillet/hapax:latest
+
+# Generate default configuration
+docker run --rm -v $(pwd):/output \
+  ghcr.io/teilomillet/hapax:latest \
+  cp /app/config.example.yaml /output/config.yaml
+
+# Launch Hapax
+docker run -p 8080:8080 \
+  -v $(pwd)/config.yaml:/app/config.yaml \
+  ghcr.io/teilomillet/hapax:latest
 ```
 
-### Container Health
-
-The container includes health checks that monitor:
-- HTTP server availability
-- Application readiness
-- Basic functionality
+## What's Next
 
-Access the health status:
-- Health endpoint: http://localhost:8080/health
-- Metrics endpoint: http://localhost:8080/metrics
-- Prometheus: http://localhost:9090
+Hapax is continuously evolving.
 
-## Testing
+## Open Source
 
-The project includes a comprehensive test suite with a mock LLM implementation that can be used for testing LLM-dependent code:
-
-```go
-import "github.com/teilomillet/hapax/mock_test"
-
-// Create a mock LLM with custom response
-llm := &MockLLM{
-	GenerateFunc: func(ctx context.Context, p *gollm.Prompt) (string, error) {
-		return "Custom response", nil
-	},
-}
-```
-
-Run the tests:
-```bash
-go test ./...
-```
+Licensed under Apache 2.0, Hapax is open for collaboration and customization.
 
-## License
+## Community & Support
 
-APACHE License 2.0
+- **Discussions**: [GitHub Discussions](https://github.com/teilomillet/hapax/discussions)
+- **Documentation**: [Hapax Wiki](https://github.com/teilomillet/hapax/wiki)
+- **Issues**: [GitHub Issues](https://github.com/teilomillet/hapax/issues)
 
-## Contributing
+## Our Vision
 
-Contributions are welcome! Please feel free to submit a Pull Request.
\ No newline at end of file
+We believe LLM infrastructure should be simple, reliable, and adaptable. Hapax represents our commitment to making LLM integration accessible and powerful.
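+
+## A First Request
+
+Once a container from Getting Started is running, a first call might look like this. This is a sketch: the `/v1/completions` path and the `prompt` JSON field follow the routing example above and may differ under your own route configuration:
+
+```bash
+curl -X POST http://localhost:8080/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{"prompt": "Say hello"}'
+```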
\ No newline at end of file
diff --git a/cmd/hapax/main.go b/cmd/hapax/main.go
index be2554d..03a1fa1 100644
--- a/cmd/hapax/main.go
+++ b/cmd/hapax/main.go
@@ -35,7 +35,12 @@ func main() {
 	if err != nil {
 		log.Fatalf("Failed to create logger: %v", err)
 	}
-	defer logger.Sync()
+	defer func() {
+		if err := logger.Sync(); err != nil {
+			// Report the sync failure on stderr, since the zap logger itself may be unusable
+			fmt.Fprintf(os.Stderr, "Failed to sync logger: %v\n", err)
+		}
+	}()
 
 	// Load configuration
 	cfg, err := config.LoadFile(*configFile)
diff --git a/config.example.yaml b/config.example.yaml
index 42c2ea2..496f381 100644
--- a/config.example.yaml
+++ b/config.example.yaml
@@ -8,15 +8,15 @@ server:
 providers:
   openai:
     type: openai
-    model: gpt-4
+    model: gpt-4o-mini
     api_key: ${OPENAI_API_KEY}
   anthropic:
     type: anthropic
-    model: claude-2
+    model: claude-3.5-haiku-latest
     api_key: ${ANTHROPIC_API_KEY}
   ollama:
     type: ollama
-    model: llama2
+    model: llama3
     api_key: ""
 
 # Order of provider preference for failover
diff --git a/errors/errors.go b/errors/errors.go
index 743b59c..20e9fc4 100644
--- a/errors/errors.go
+++ b/errors/errors.go
@@ -152,13 +152,20 @@ func WriteError(w http.ResponseWriter, err *HapaxError) {
 	w.Header().Set("Content-Type", "application/json")
 	w.WriteHeader(err.Code)
 
-	// Convert HapaxError to ErrorResponse and write it
-	json.NewEncoder(w).Encode(&ErrorResponse{
+	// Check the error return from Encode
+	if encodeErr := json.NewEncoder(w).Encode(&ErrorResponse{
 		Type:      err.Type,
 		Message:   err.Message,
 		RequestID: err.RequestID,
 		Details:   err.Details,
-	})
+	}); encodeErr != nil {
+		// Encoding failed after the status line was sent; all we can do is log it
+		zap.L().Error("Failed to encode error response", zap.Error(encodeErr))
+
+		// Best-effort fallback body; its write error can likewise only be logged
+		if _, writeErr := w.Write([]byte(`{"error": "Failed to encode error response"}`)); writeErr != nil {
+			zap.L().Error("Failed to write fallback error response", zap.Error(writeErr))
+		}
+	}
 }
 
 // Error is a drop-in replacement for http.Error that creates and writes
diff --git a/server/middleware/timeout.go b/server/middleware/timeout.go
index ca43ea3..69232b7 100644
--- a/server/middleware/timeout.go
+++ b/server/middleware/timeout.go
@@ -60,6 +60,11 @@ func Timeout(timeout time.Duration) func(http.Handler) http.Handler {
 	return func(next http.Handler) http.Handler {
+		// Resolve the default once, outside the per-request handler, so that
+		// concurrent requests do not race on the captured timeout variable
+		if timeout == 0 {
+			timeout = defaultTimeout
+		}
 		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 			// Create a context with timeout
 			ctx, cancel := context.WithTimeout(r.Context(), timeout)
 			defer cancel() // Ensure cancel is called to release resources
diff --git a/server/routing/router.go b/server/routing/router.go
index 48191ae..0793c2a 100644
--- a/server/routing/router.go
+++ b/server/routing/router.go
@@ -114,7 +114,7 @@ func (r *Router) setupRoutes() {
 		if route.HealthCheck != nil {
 			healthPath := fmt.Sprintf("%s/health", path)
 			router.Get(healthPath, r.healthCheckHandler(route)) // Register health check handler
-			r.startHealthCheck(route) // Start health check routine
+			r.startHealthCheck(route)                           // Start health check routine
 		}
 	})
 }
@@ -127,24 +127,33 @@ func (r *Router) setupRoutes() {
 }
 
 // healthCheckHandler returns a handler for route-specific health checks.
-// It checks the health state of the route and responds accordingly.
 func (r *Router) healthCheckHandler(route config.RouteConfig) http.HandlerFunc {
 	return func(w http.ResponseWriter, req *http.Request) {
+		// Set the Content-Type before any WriteHeader call: headers added
+		// after the status line has been written are silently dropped
+		w.Header().Set("Content-Type", "application/json")
 		status := "healthy"
 		if v, ok := r.healthState.Load(route.Path); ok && !v.(bool) {
 			status = "unhealthy"
-			w.WriteHeader(http.StatusServiceUnavailable) // Respond with 503 if unhealthy
+			w.WriteHeader(http.StatusServiceUnavailable)
+		}
+
+		// Properly handle potential JSON encoding errors
+		if err := json.NewEncoder(w).Encode(map[string]string{"status": status}); err != nil {
+			// Log the error and send a generic error response
+			r.logger.Error("Failed to encode health check response",
+				zap.String("route", route.Path),
+				zap.Error(err))
+
+			// Send a fallback error response
+			http.Error(w, "Internal Server Error", http.StatusInternalServerError)
 		}
-		json.NewEncoder(w).Encode(map[string]string{"status": status}) // Encode health status as JSON
 	}
 }
 
 // globalHealthCheckHandler returns a handler for the global health check endpoint.
-// It checks the health of all routes and responds with their statuses.
 func (r *Router) globalHealthCheckHandler() http.HandlerFunc {
 	return func(w http.ResponseWriter, req *http.Request) {
 		allHealthy := true
-		statuses := make(map[string]string) // Map to hold health statuses
+		statuses := make(map[string]string)
 
 		// Iterate through health states of all routes
 		r.healthState.Range(func(key, value interface{}) bool {
@@ -160,13 +169,25 @@ func (r *Router) globalHealthCheckHandler() http.HandlerFunc {
 		})
 
+		// As above, set the Content-Type before the status code goes out
+		w.Header().Set("Content-Type", "application/json")
 		if !allHealthy {
-			w.WriteHeader(http.StatusServiceUnavailable) // Respond with 503 if any service is unhealthy
+			w.WriteHeader(http.StatusServiceUnavailable)
 		}
 
-		json.NewEncoder(w).Encode(map[string]interface{}{
+		// Properly handle potential JSON encoding errors
+		response := map[string]interface{}{
 			"status":   map[string]bool{"global": allHealthy},
 			"services": statuses,
-		}) // Encode global health status as JSON
+		}
+
+		if err := json.NewEncoder(w).Encode(response); err != nil {
+			// Log the error and send a generic error response
+			r.logger.Error("Failed to encode global health check response",
+				zap.Bool("all_healthy", allHealthy),
+				zap.Error(err))
+
+			// Send a fallback error response
+			http.Error(w, "Internal Server Error", http.StatusInternalServerError)
+		}
 	}
 }
diff --git a/server/routing/router_test.go b/server/routing/router_test.go
index 3a26604..c6195e5 100644
--- a/server/routing/router_test.go
+++ b/server/routing/router_test.go
@@ -98,10 +98,14 @@ func TestRouter_VersionedRouting(t *testing.T) {
 	}
 	handlers := map[string]http.Handler{
 		"test": http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-			w.Write([]byte("v1"))
+			if _, err := w.Write([]byte("v1")); err != nil {
+				t.Fatalf("Failed to write response: %v", err)
+			}
 		}),
 		"test2": http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-			w.Write([]byte("v2"))
+			if _, err := w.Write([]byte("v2")); err != nil {
+				t.Fatalf("Failed to write response: %v", err)
+			}
 		}),
 	}
 	logger := zap.NewNop()
diff --git a/server/server.go b/server/server.go
index f79bd92..fd105a6 100644
--- a/server/server.go
+++ b/server/server.go
@@ -127,7 +127,9 @@ func NewRouter(completion http.Handler) *Router {
 	// - LLM request counts by provider/model
 	r.Get("/metrics", func(w http.ResponseWriter, r *http.Request) {
 		w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
-		w.Write([]byte(`
+
+		// Check the error return from Write
+		if _, err := w.Write([]byte(`
 # HELP hapax_requests_total The total number of HTTP requests.
 # TYPE hapax_requests_total counter
 hapax_requests_total{code="200"} 10
@@ -145,7 +147,14 @@ hapax_request_duration_seconds_count 10
 # HELP hapax_llm_requests_total The total number of LLM requests.
 # TYPE hapax_llm_requests_total counter
 hapax_llm_requests_total{provider="openai",model="gpt-3.5-turbo"} 5
-`))
+`)); err != nil {
+			// Log the write failure; the client has most likely disconnected
+			fmt.Printf("Failed to write metrics response: %v\n", err)
+
+			// Best-effort error response; ineffective if headers were already sent
+			http.Error(w, "Failed to generate metrics", http.StatusInternalServerError)
+			return
+		}
 	})
 
 	return router
@@ -353,39 +362,57 @@ func (s *Server) Start(ctx context.Context) error {
 }
 
 func main() {
+	// Create logger with explicit error handling
 	logger, err := zap.NewProduction()
 	if err != nil {
-		fmt.Printf("Failed to create logger: %v\n", err)
-		return
+		// Fail fast if logger creation fails
+		fmt.Printf("Critical error: Failed to create logger: %v\n", err)
+		os.Exit(1)
 	}
-	defer logger.Sync()
+
+	// Ensure logger is synced, with explicit error handling
+	defer func() {
+		if syncErr := logger.Sync(); syncErr != nil {
+			// The process is exiting anyway; report the sync failure directly
+			fmt.Printf("Warning: Failed to sync logger: %v\n", syncErr)
+		}
+	}()
+
+	// Set global logger
 	errors.SetLogger(logger)
 
-	configPath := "config.yaml" // Or get from environment/flags
+	// Configuration and server setup with comprehensive error handling
+	configPath := "config.yaml"
 	server, err := NewServer(configPath, logger)
 	if err != nil {
-		logger.Fatal("Failed to create server",
+		logger.Fatal("Server initialization failed",
 			zap.Error(err),
+			zap.String("config_path", configPath),
 		)
 	}
 
-	// Handle graceful shutdown
+	// Graceful shutdown infrastructure
 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
 
-	// Handle OS signals
+	// Signal handling with detailed logging
 	sigChan := make(chan os.Signal, 1)
 	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
 	go func() {
 		sig := <-sigChan
-		logger.Info("Received shutdown signal", zap.String("signal", sig.String()))
+		logger.Info("Shutdown signal received",
+			zap.String("signal", sig.String()),
+			zap.String("action", "initiating graceful shutdown"),
+		)
 		cancel()
 	}()
 
+	// Server start with comprehensive error tracking
 	if err := server.Start(ctx); err != nil {
-		logger.Fatal("Server error",
+		logger.Fatal("Server startup or runtime error",
 			zap.Error(err),
+			zap.String("action", "server_start_failed"),
 		)
 	}
 }
diff --git a/tests/circuitbreaker_test.go b/tests/circuitbreaker_test.go
index e3e3bae..97e731b 100644
--- a/tests/circuitbreaker_test.go
+++ b/tests/circuitbreaker_test.go
@@ -122,16 +122,33 @@ func TestCircuitBreaker(t *testing.T) {
 	cb, err := newCB()
 	require.NoError(t, err)
 
+	// Track failures explicitly
+	var failureCount int
+	var successCount int
+
 	// Execute a mix of successful and failed requests
 	for i := 0; i < 3; i++ {
-		cb.Execute(func() error {
+		execErr := cb.Execute(func() error {
 			if i%2 == 0 {
 				return errors.New("failure")
 			}
 			return nil
 		})
+
+		// Explicitly handle the error return
+		if execErr != nil {
+			failureCount++
+			// Log the failure for easier debugging
+			t.Logf("Execution %d failed: %v", i, execErr)
+		} else {
+			successCount++
+		}
 	}
 
+	// Now we can make more precise assertions
+	assert.Equal(t, 2, failureCount, "Expected 2 failures")
+	assert.Equal(t, 1, successCount, "Expected 1 success")
+
 	counts := cb.Counts()
 	assert.True(t, counts.TotalFailures > 0)
 	assert.True(t, counts.Requests > counts.TotalFailures)
diff --git a/tests/docker_test.go b/tests/docker_test.go
index d8e1f3d..11e7b4a 100644
--- a/tests/docker_test.go
+++ b/tests/docker_test.go
@@ -53,7 +53,13 @@ func TestDockerBuild(t *testing.T) {
 	cleanup := func() {
 		cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), 10*time.Second)
 		defer cleanupCancel()
-		exec.CommandContext(cleanupCtx, "docker", "rm", "-f", containerName).Run()
+
+		// Check the error return from Run()
+		if err := exec.CommandContext(cleanupCtx, "docker", "rm", "-f", containerName).Run(); err != nil {
+			// In a test, you typically want to log the error rather than fail the entire test,
+			// unless the cleanup failure is critical
+			t.Logf("Failed to remove Docker container %s: %v", containerName, err)
+		}
 	}
 	cleanup()       // Clean up any leftover containers
 	defer cleanup()
@@ -228,12 +234,24 @@ func TestDockerCompose(t *testing.T) {
 	// Enhanced cleanup to remove both containers and test config
 	cleanup := func() {
+		// Docker Compose cleanup with error handling
 		cmd := exec.CommandContext(ctx, "docker", "compose", "-f", filepath.Join(projectRoot, "docker-compose.yml"), "down", "-v")
 		cmd.Stdout = os.Stdout
 		cmd.Stderr = os.Stderr
-		cmd.Run()
-		// Clean up the config file
-		os.Remove(filepath.Join(projectRoot, "config.yaml"))
+
+		if err := cmd.Run(); err != nil {
+			// Log the error without failing the test, as this is a cleanup step
+			t.Logf("Failed to remove Docker Compose containers: %v", err)
+		}
+
+		// Config file cleanup with error handling
+		configPath := filepath.Join(projectRoot, "config.yaml")
+		if err := os.Remove(configPath); err != nil {
+			// Only log if the error is not because the file doesn't exist
+			if !os.IsNotExist(err) {
+				t.Logf("Failed to remove config file %s: %v", configPath, err)
+			}
+		}
 	}
 	cleanup()       // Clean up any leftover containers and files
 	defer cleanup()